aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author DRC <dcommander@users.sourceforge.net> 2014-05-14 15:00:10 +0000
committer DRC <dcommander@users.sourceforge.net> 2014-05-14 15:00:10 +0000
commit 6a61c1e6dc0263148e8e8e1a13da8624cc5a065f (patch)
tree 58cf49c8ce818ee310b5a3b81cdb753adf9f5c1e
parent 1e9cbbad8a04b572c1777c9bebdf4209f9998f89 (diff)
download libjpeg-turbo-6a61c1e6dc0263148e8e8e1a13da8624cc5a065f.tar.gz
SIMD-accelerated h2v2 smooth downsampling routine for MIPS DSPr2
git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1301 632fc199-4ca6-4c93-a231-07263d6284db
-rw-r--r-- jcsample.c 5
-rw-r--r-- jsimd.h 7
-rw-r--r-- jsimd_none.c 12
-rw-r--r-- simd/jsimd.h 5
-rw-r--r-- simd/jsimd_mips.c 28
-rw-r--r-- simd/jsimd_mips_dspr2.S 300
6 files changed, 356 insertions, 1 deletions
diff --git a/jcsample.c b/jcsample.c
index 76dcfcd4..36ae7846 100644
--- a/jcsample.c
+++ b/jcsample.c
@@ -504,7 +504,10 @@ jinit_downsampler (j_compress_ptr cinfo)
compptr->v_samp_factor * 2 == cinfo->max_v_samp_factor) {
#ifdef INPUT_SMOOTHING_SUPPORTED
if (cinfo->smoothing_factor) {
- downsample->methods[ci] = h2v2_smooth_downsample;
+ if (jsimd_can_h2v2_smooth_downsample())
+ downsample->methods[ci] = jsimd_h2v2_smooth_downsample;
+ else
+ downsample->methods[ci] = h2v2_smooth_downsample;
downsample->pub.need_context_rows = TRUE;
} else {
#endif
diff --git a/jsimd.h b/jsimd.h
index ae072157..2817137d 100644
--- a/jsimd.h
+++ b/jsimd.h
@@ -60,6 +60,13 @@ EXTERN(int) jsimd_can_h2v1_downsample JPP((void));
EXTERN(void) jsimd_h2v2_downsample
JPP((j_compress_ptr cinfo, jpeg_component_info * compptr,
JSAMPARRAY input_data, JSAMPARRAY output_data));
+
+EXTERN(int) jsimd_can_h2v2_smooth_downsample JPP((void));
+
+EXTERN(void) jsimd_h2v2_smooth_downsample
+ JPP((j_compress_ptr cinfo, jpeg_component_info * compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data));
+
EXTERN(void) jsimd_h2v1_downsample
JPP((j_compress_ptr cinfo, jpeg_component_info * compptr,
JSAMPARRAY input_data, JSAMPARRAY output_data));
diff --git a/jsimd_none.c b/jsimd_none.c
index 7b2e9651..a6e82759 100644
--- a/jsimd_none.c
+++ b/jsimd_none.c
@@ -69,6 +69,12 @@ jsimd_can_h2v1_downsample (void)
return 0;
}
+GLOBAL(int)
+jsimd_can_h2v2_smooth_downsample (void)
+{
+ return 0; /* always 0, so the jcsample.c check keeps the plain-C h2v2_smooth_downsample */
+}
+
GLOBAL(void)
jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
JSAMPARRAY input_data, JSAMPARRAY output_data)
@@ -76,6 +82,12 @@ jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
}
GLOBAL(void)
+jsimd_h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{ /* empty stub; presumably never called, since jsimd_can_h2v2_smooth_downsample() returns 0 in this file -- confirm */
+}
+
+GLOBAL(void)
jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
JSAMPARRAY input_data, JSAMPARRAY output_data)
{
diff --git a/simd/jsimd.h b/simd/jsimd.h
index dc227edd..7067a2d7 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -498,6 +498,11 @@ EXTERN(void) jsimd_h2v2_downsample_mips_dspr2
JPP((JDIMENSION image_width, int max_v_samp_factor,
JDIMENSION v_samp_factor, JDIMENSION width_blocks,
JSAMPARRAY input_data, JSAMPARRAY output_data));
+EXTERN(void) jsimd_h2v2_smooth_downsample_mips_dspr2
+ JPP((JSAMPARRAY input_data, JSAMPARRAY output_data,
+ JDIMENSION v_samp_factor, int max_v_samp_factor,
+ int smoothing_factor, JDIMENSION width_blocks,
+ JDIMENSION image_width));
EXTERN(void) jsimd_h2v1_downsample_mips_dspr2
JPP((JDIMENSION image_width, int max_v_samp_factor,
JDIMENSION v_samp_factor, JDIMENSION width_blocks,
diff --git a/simd/jsimd_mips.c b/simd/jsimd_mips.c
index 63a25cb9..aebd549a 100644
--- a/simd/jsimd_mips.c
+++ b/simd/jsimd_mips.c
@@ -279,6 +279,24 @@ jsimd_can_h2v2_downsample (void)
}
GLOBAL(int)
+jsimd_can_h2v2_smooth_downsample (void)
+{
+ init_simd(); /* NOTE(review): presumably populates simd_support; defined outside this hunk -- confirm */
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if(DCTSIZE != 8)
+ return 0;
+ if (simd_support & JSIMD_MIPS_DSPR2)
+ return 1; /* nonzero makes jcsample.c install jsimd_h2v2_smooth_downsample */
+
+ return 0; /* DSPr2 flag not set: caller falls back to the C routine */
+}
+
+GLOBAL(int)
jsimd_can_h2v1_downsample (void)
{
init_simd();
@@ -305,6 +323,16 @@ jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
}
GLOBAL(void)
+jsimd_h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{ /* thin wrapper: unpacks cinfo/compptr fields into scalar arguments for the assembly routine */
+ jsimd_h2v2_smooth_downsample_mips_dspr2(input_data, output_data, /* data pointers come first in this prototype */
+ compptr->v_samp_factor, cinfo->max_v_samp_factor,
+ cinfo->smoothing_factor, compptr->width_in_blocks,
+ cinfo->image_width);
+}
+
+GLOBAL(void)
jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
JSAMPARRAY input_data, JSAMPARRAY output_data)
{
diff --git a/simd/jsimd_mips_dspr2.S b/simd/jsimd_mips_dspr2.S
index d7d76f7c..725b5589 100644
--- a/simd/jsimd_mips_dspr2.S
+++ b/simd/jsimd_mips_dspr2.S
@@ -1210,6 +1210,306 @@ LEAF_MIPS_DSPR2(jsimd_h2v2_downsample_mips_dspr2)
nop
END(jsimd_h2v2_downsample_mips_dspr2)
/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_h2v2_smooth_downsample_mips_dspr2)
+/* 2:1 horizontal and 2:1 vertical downsampling with smoothing.  Arguments:
+ * a0 - input_data
+ * a1 - output_data
+ * a2 - compptr->v_samp_factor
+ * a3 - cinfo->max_v_samp_factor
+ * 16(sp) - cinfo->smoothing_factor
+ * 20(sp) - compptr->width_in_blocks
+ * 24(sp) - cinfo->image_width
+ */
+
+ .set at
+
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ lw s7, 52(sp) // compptr->width_in_blocks (documented offset + 32 after the register save)
+ lw s0, 56(sp) // cinfo->image_width
+ lw s6, 48(sp) // cinfo->smoothing_factor
+ sll s7, 3 // output_cols = width_in_blocks * DCTSIZE
+ sll v0, s7, 1 // v0 = output_cols * 2 = input columns required
+ subu v0, v0, s0 // v0 = number of columns to pad past image_width
+ blez v0, 2f // no padding needed
+ move v1, zero // (delay slot) v1 = row counter for the padding loop
+ addiu t0, a3, 2 // t0 = cinfo->max_v_samp_factor + 2
+0:
+ addiu t1, a0, -4 // begin one pointer slot before input_data[0]
+ sll t2, v1, 2
+ lwx t1, t2(t1) // t1 = input_data[v1 - 1]
+ move t3, v0 // t3 = pad columns remaining in this row
+ addu t1, t1, s0 // t1 = first column past image_width
+ lbu t2, -1(t1) // t2 = last real sample of the row
+1:
+ addiu t3, t3, -1
+ sb t2, 0(t1) // replicate the last sample into the padding
+ bgtz t3, 1b
+ addiu t1, t1, 1
+ addiu v1, v1, 1
+ bne v1, t0, 0b
+ nop
+2:
+ li v0, 80
+ mul v0, s6, v0
+ li v1, 16384
+ move t4, zero // t4 = outrow
+ move t5, zero // t5 = inrow
+ subu t6, v1, v0 // t6 = 16384 - smoothing_factor * 80
+ sll t7, s6, 4 // t7 = smoothing_factor * 16
+3:
+/* Special case for first column: pretend column -1 is same as column 0 */
+ sll v0, t4, 2
+ lwx t8, v0(a1) // outptr = output_data[outrow]
+ sll v1, t5, 2
+ addiu t9, v1, 4
+ addiu s0, v1, -4
+ addiu s1, v1, 8
+ lwx s2, v1(a0) // inptr0 = input_data[inrow]
+ lwx t9, t9(a0) // inptr1 = input_data[inrow+1]
+ lwx s0, s0(a0) // above_ptr = input_data[inrow-1]
+ lwx s1, s1(a0) // below_ptr = input_data[inrow+2]
+ lh v0, 0(s2)
+ lh v1, 0(t9)
+ lh t0, 0(s0)
+ lh t1, 0(s1)
+ ins v0, v1, 16, 16 // v0 = the 2x2 block: two bytes of inptr0, two of inptr1
+ ins t0, t1, 16, 16 // t0 = two bytes of above_ptr, two of below_ptr
+ raddu.w.qb t2, v0 // t2 = sum of the four block samples
+ raddu.w.qb s3, t0 // s3 = sum of the four samples directly above/below
+ lbu v0, 0(s2)
+ lbu v1, 2(s2)
+ lbu t0, 0(t9)
+ lbu t1, 2(t9)
+ addu v0, v0, v1
+ mult $ac1, t2, t6 // ac1 = block sum * t6
+ addu t0, t0, t1
+ lbu t2, 2(s0)
+ addu t0, t0, v0
+ lbu t3, 2(s1)
+ addu s3, t0, s3
+ lbu v0, 0(s0)
+ lbu t0, 0(s1)
+ sll s3, s3, 1 // edge neighbours are counted twice
+ addu v0, v0, t2
+ addu t0, t0, t3
+ addu t0, t0, v0
+ addu s3, t0, s3 // plus the four corner neighbours, counted once
+ madd $ac1, s3, t7 // ac1 += neighbour sum * t7
+ extr_r.w v0, $ac1, 16 // result = ac1 >> 16 with rounding
+ addiu t8, t8, 1
+ addiu s2, s2, 2
+ addiu t9, t9, 2
+ addiu s0, s0, 2
+ addiu s1, s1, 2
+ sb v0, -1(t8)
+ addiu s4, s7, -2
+ and s4, s4, 3 // s4 = (output_cols - 2) % 4 columns handled one at a time
+ addu s5, s4, t8 // end address
+4:
+ lh v0, 0(s2)
+ lh v1, 0(t9)
+ lh t0, 0(s0)
+ lh t1, 0(s1)
+ ins v0, v1, 16, 16
+ ins t0, t1, 16, 16
+ raddu.w.qb t2, v0
+ raddu.w.qb s3, t0
+ lbu v0, -1(s2)
+ lbu v1, 2(s2)
+ lbu t0, -1(t9)
+ lbu t1, 2(t9)
+ addu v0, v0, v1
+ mult $ac1, t2, t6
+ addu t0, t0, t1
+ lbu t2, 2(s0)
+ addu t0, t0, v0
+ lbu t3, 2(s1)
+ addu s3, t0, s3
+ lbu v0, -1(s0)
+ lbu t0, -1(s1)
+ sll s3, s3, 1
+ addu v0, v0, t2
+ addu t0, t0, t3
+ addu t0, t0, v0
+ addu s3, t0, s3
+ madd $ac1, s3, t7
+ extr_r.w t2, $ac1, 16
+ addiu t8, t8, 1
+ addiu s2, s2, 2
+ addiu t9, t9, 2
+ addiu s0, s0, 2
+ sb t2, -1(t8)
+ bne s5, t8, 4b
+ addiu s1, s1, 2
+ addiu s5, s7, -2
+ subu s5, s5, s4 // columns left for the loop below, four per pass
+ addu s5, s5, t8 // end address
+5:
+ lh v0, 0(s2)
+ lh v1, 0(t9)
+ lh t0, 0(s0)
+ lh t1, 0(s1)
+ ins v0, v1, 16, 16
+ ins t0, t1, 16, 16
+ raddu.w.qb t2, v0
+ raddu.w.qb s3, t0
+ lbu v0, -1(s2)
+ lbu v1, 2(s2)
+ lbu t0, -1(t9)
+ lbu t1, 2(t9)
+ addu v0, v0, v1
+ mult $ac1, t2, t6
+ addu t0, t0, t1
+ lbu t2, 2(s0)
+ addu t0, t0, v0
+ lbu t3, 2(s1)
+ addu s3, t0, s3
+ lbu v0, -1(s0)
+ lbu t0, -1(s1)
+ sll s3, s3, 1
+ addu v0, v0, t2
+ addu t0, t0, t3
+ lh v1, 2(t9)
+ addu t0, t0, v0
+ lh v0, 2(s2)
+ addu s3, t0, s3
+ lh t0, 2(s0)
+ lh t1, 2(s1)
+ madd $ac1, s3, t7
+ extr_r.w t2, $ac1, 16
+ ins t0, t1, 16, 16
+ ins v0, v1, 16, 16
+ raddu.w.qb s3, t0
+ lbu v1, 4(s2)
+ lbu t0, 1(t9)
+ lbu t1, 4(t9)
+ sb t2, 0(t8) // first of four output samples
+ raddu.w.qb t3, v0
+ lbu v0, 1(s2)
+ addu t0, t0, t1
+ mult $ac1, t3, t6
+ addu v0, v0, v1
+ lbu t2, 4(s0)
+ addu t0, t0, v0
+ lbu v0, 1(s0)
+ addu s3, t0, s3
+ lbu t0, 1(s1)
+ lbu t3, 4(s1)
+ addu v0, v0, t2
+ sll s3, s3, 1
+ addu t0, t0, t3
+ lh v1, 4(t9)
+ addu t0, t0, v0
+ lh v0, 4(s2)
+ addu s3, t0, s3
+ lh t0, 4(s0)
+ lh t1, 4(s1)
+ madd $ac1, s3, t7
+ extr_r.w t2, $ac1, 16
+ ins t0, t1, 16, 16
+ ins v0, v1, 16, 16
+ raddu.w.qb s3, t0
+ lbu v1, 6(s2)
+ lbu t0, 3(t9)
+ lbu t1, 6(t9)
+ sb t2, 1(t8) // second output sample
+ raddu.w.qb t3, v0
+ lbu v0, 3(s2)
+ addu t0, t0, t1
+ mult $ac1, t3, t6
+ addu v0, v0, v1
+ lbu t2, 6(s0)
+ addu t0, t0, v0
+ lbu v0, 3(s0)
+ addu s3, t0, s3
+ lbu t0, 3(s1)
+ lbu t3, 6(s1)
+ addu v0, v0, t2
+ sll s3, s3, 1
+ addu t0, t0, t3
+ lh v1, 6(t9)
+ addu t0, t0, v0
+ lh v0, 6(s2)
+ addu s3, t0, s3
+ lh t0, 6(s0)
+ lh t1, 6(s1)
+ madd $ac1, s3, t7
+ extr_r.w t3, $ac1, 16
+ ins t0, t1, 16, 16
+ ins v0, v1, 16, 16
+ raddu.w.qb s3, t0
+ lbu v1, 8(s2)
+ lbu t0, 5(t9)
+ lbu t1, 8(t9)
+ sb t3, 2(t8) // third output sample
+ raddu.w.qb t2, v0
+ lbu v0, 5(s2)
+ addu t0, t0, t1
+ mult $ac1, t2, t6
+ addu v0, v0, v1
+ lbu t2, 8(s0)
+ addu t0, t0, v0
+ lbu v0, 5(s0)
+ addu s3, t0, s3
+ lbu t0, 5(s1)
+ lbu t3, 8(s1)
+ addu v0, v0, t2
+ sll s3, s3, 1
+ addu t0, t0, t3
+ addiu t8, t8, 4
+ addu t0, t0, v0
+ addiu s2, s2, 8
+ addu s3, t0, s3
+ addiu t9, t9, 8
+ madd $ac1, s3, t7
+ extr_r.w t1, $ac1, 16
+ addiu s0, s0, 8
+ addiu s1, s1, 8
+ bne s5, t8, 5b
+ sb t1, -1(t8) // (delay slot) fourth output sample
+/* Special case for last column: the right-hand neighbour is the last column itself */
+ lh v0, 0(s2)
+ lh v1, 0(t9)
+ lh t0, 0(s0)
+ lh t1, 0(s1)
+ ins v0, v1, 16, 16
+ ins t0, t1, 16, 16
+ raddu.w.qb t2, v0
+ raddu.w.qb s3, t0
+ lbu v0, -1(s2)
+ lbu v1, 1(s2)
+ lbu t0, -1(t9)
+ lbu t1, 1(t9)
+ addu v0, v0, v1
+ mult $ac1, t2, t6
+ addu t0, t0, t1
+ lbu t2, 1(s0)
+ addu t0, t0, v0
+ lbu t3, 1(s1)
+ addu s3, t0, s3
+ lbu v0, -1(s0)
+ lbu t0, -1(s1)
+ sll s3, s3, 1
+ addu v0, v0, t2
+ addu t0, t0, t3
+ addu t0, t0, v0
+ addu s3, t0, s3
+ madd $ac1, s3, t7
+ extr_r.w t0, $ac1, 16
+/* inrow advances by exactly 2 per output row; done once, in the delay slot below */
+ sb t0, 0(t8)
+ addiu t4, t4, 1 // outrow++
+ bne t4, a2, 3b // loop until outrow == v_samp_factor
+ addiu t5, t5, 2 // (delay slot) inrow += 2
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+
+END(jsimd_h2v2_smooth_downsample_mips_dspr2)
+/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_h2v1_upsample_mips_dspr2)
/*
* a0 - cinfo->max_v_samp_factor