diff options
author | DRC <dcommander@users.sourceforge.net> | 2014-05-14 15:00:10 +0000 |
---|---|---|
committer | DRC <dcommander@users.sourceforge.net> | 2014-05-14 15:00:10 +0000 |
commit | 6a61c1e6dc0263148e8e8e1a13da8624cc5a065f (patch) | |
tree | 58cf49c8ce818ee310b5a3b81cdb753adf9f5c1e | |
parent | 1e9cbbad8a04b572c1777c9bebdf4209f9998f89 (diff) | |
download | libjpeg-turbo-6a61c1e6dc0263148e8e8e1a13da8624cc5a065f.tar.gz |
SIMD-accelerated h2v2 smooth downsampling routine for MIPS DSPr2
git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1301 632fc199-4ca6-4c93-a231-07263d6284db
-rw-r--r-- | jcsample.c | 5 | ||||
-rw-r--r-- | jsimd.h | 7 | ||||
-rw-r--r-- | jsimd_none.c | 12 | ||||
-rw-r--r-- | simd/jsimd.h | 5 | ||||
-rw-r--r-- | simd/jsimd_mips.c | 28 | ||||
-rw-r--r-- | simd/jsimd_mips_dspr2.S | 300 |
6 files changed, 356 insertions, 1 deletions
@@ -504,7 +504,10 @@ jinit_downsampler (j_compress_ptr cinfo) compptr->v_samp_factor * 2 == cinfo->max_v_samp_factor) { #ifdef INPUT_SMOOTHING_SUPPORTED if (cinfo->smoothing_factor) { - downsample->methods[ci] = h2v2_smooth_downsample; + if (jsimd_can_h2v2_smooth_downsample()) + downsample->methods[ci] = jsimd_h2v2_smooth_downsample; + else + downsample->methods[ci] = h2v2_smooth_downsample; downsample->pub.need_context_rows = TRUE; } else { #endif @@ -60,6 +60,13 @@ EXTERN(int) jsimd_can_h2v1_downsample JPP((void)); EXTERN(void) jsimd_h2v2_downsample JPP((j_compress_ptr cinfo, jpeg_component_info * compptr, JSAMPARRAY input_data, JSAMPARRAY output_data)); + +EXTERN(int) jsimd_can_h2v2_smooth_downsample JPP((void)); + +EXTERN(void) jsimd_h2v2_smooth_downsample + JPP((j_compress_ptr cinfo, jpeg_component_info * compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data)); + EXTERN(void) jsimd_h2v1_downsample JPP((j_compress_ptr cinfo, jpeg_component_info * compptr, JSAMPARRAY input_data, JSAMPARRAY output_data)); diff --git a/jsimd_none.c b/jsimd_none.c index 7b2e9651..a6e82759 100644 --- a/jsimd_none.c +++ b/jsimd_none.c @@ -69,6 +69,12 @@ jsimd_can_h2v1_downsample (void) return 0; } +GLOBAL(int) +jsimd_can_h2v2_smooth_downsample (void) +{ + return 0; +} + GLOBAL(void) jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, JSAMPARRAY input_data, JSAMPARRAY output_data) @@ -76,6 +82,12 @@ jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, } GLOBAL(void) +jsimd_h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ +} + +GLOBAL(void) jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, JSAMPARRAY input_data, JSAMPARRAY output_data) { diff --git a/simd/jsimd.h b/simd/jsimd.h index dc227edd..7067a2d7 100644 --- a/simd/jsimd.h +++ b/simd/jsimd.h @@ -498,6 +498,11 @@ EXTERN(void) jsimd_h2v2_downsample_mips_dspr2 JPP((JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, JDIMENSION width_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data)); +EXTERN(void) jsimd_h2v2_smooth_downsample_mips_dspr2 + JPP((JSAMPARRAY input_data, JSAMPARRAY output_data, + JDIMENSION v_samp_factor, int max_v_samp_factor, + int smoothing_factor, JDIMENSION width_blocks, + JDIMENSION image_width)); EXTERN(void) jsimd_h2v1_downsample_mips_dspr2 JPP((JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, JDIMENSION width_blocks, diff --git a/simd/jsimd_mips.c b/simd/jsimd_mips.c index 63a25cb9..aebd549a 100644 --- a/simd/jsimd_mips.c +++ b/simd/jsimd_mips.c @@ -279,6 +279,24 @@ jsimd_can_h2v2_downsample (void) } GLOBAL(int) +jsimd_can_h2v2_smooth_downsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if(DCTSIZE != 8) + return 0; + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) jsimd_can_h2v1_downsample (void) { init_simd(); @@ -305,6 +323,16 @@ jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, } GLOBAL(void) +jsimd_h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v2_smooth_downsample_mips_dspr2(input_data, output_data, + compptr->v_samp_factor, cinfo->max_v_samp_factor, + cinfo->smoothing_factor, compptr->width_in_blocks, + cinfo->image_width); +} + +GLOBAL(void) jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, JSAMPARRAY input_data, JSAMPARRAY output_data) { diff --git a/simd/jsimd_mips_dspr2.S b/simd/jsimd_mips_dspr2.S index d7d76f7c..725b5589 100644 --- a/simd/jsimd_mips_dspr2.S +++ b/simd/jsimd_mips_dspr2.S @@ -1210,6 +1210,306 @@ LEAF_MIPS_DSPR2(jsimd_h2v2_downsample_mips_dspr2) nop END(jsimd_h2v2_downsample_mips_dspr2) /*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_h2v2_smooth_downsample_mips_dspr2) +/* + * a0 - input_data + * a1 - output_data + * a2 - compptr->v_samp_factor + * a3 - cinfo->max_v_samp_factor + * 16(sp) - cinfo->smoothing_factor + * 20(sp) - compptr->width_in_blocks + * 24(sp) - cinfo->image_width + */ + + .set at + + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + lw s7, 52(sp) // compptr->width_in_blocks + lw s0, 56(sp) // cinfo->image_width + lw s6, 48(sp) // cinfo->smoothing_factor + sll s7, 3 // output_cols = width_in_blocks * DCTSIZE + sll v0, s7, 1 + subu v0, v0, s0 + blez v0, 2f + move v1, zero + addiu t0, a3, 2 // t0 = cinfo->max_v_samp_factor + 2 +0: + addiu t1, a0, -4 + sll t2, v1, 2 + lwx t1, t2(t1) + move t3, v0 + addu t1, t1, s0 + lbu t2, -1(t1) +1: + addiu t3, t3, -1 + sb t2, 0(t1) + bgtz t3, 1b + addiu t1, t1, 1 + addiu v1, v1, 1 + bne v1, t0, 0b + nop +2: + li v0, 80 + mul v0, s6, v0 + li v1, 16384 + move t4, zero + move t5, zero + subu t6, v1, v0 // t6 = 16384 - tmp_smoot_f * 80 + sll t7, s6, 4 // t7 = tmp_smoot_f * 16 +3: +/* Special case for first column: pretend column -1 is same as column 0 */ + sll v0, t4, 2 + lwx t8, v0(a1) // outptr = output_data[outrow] + sll v1, t5, 2 + addiu t9, v1, 4 + addiu s0, v1, -4 + addiu s1, v1, 8 + lwx s2, v1(a0) // inptr0 = input_data[inrow] + lwx t9, t9(a0) // inptr1 = input_data[inrow+1] + lwx s0, s0(a0) // above_ptr = input_data[inrow-1] + lwx s1, s1(a0) // below_ptr = input_data[inrow+2] + lh v0, 0(s2) + lh v1, 0(t9) + lh t0, 0(s0) + lh t1, 0(s1) + ins v0, v1, 16, 16 + ins t0, t1, 16, 16 + raddu.w.qb t2, v0 + raddu.w.qb s3, t0 + lbu v0, 0(s2) + lbu v1, 2(s2) + lbu t0, 0(t9) + lbu t1, 2(t9) + addu v0, v0, v1 + mult $ac1,t2, t6 + addu t0, t0, t1 + lbu t2, 2(s0) + addu t0, t0, v0 + lbu t3, 2(s1) + addu s3, t0, s3 + lbu v0, 0(s0) + lbu t0, 0(s1) + sll s3, s3, 1 + addu v0, v0, t2 + addu t0, t0, t3 + addu t0, t0, v0 + addu s3, t0, s3 + madd $ac1,s3, t7 + extr_r.w v0, $ac1, 16 + addiu t8, t8, 1 + addiu s2, s2, 2 + addiu t9, t9, 2 + addiu s0, s0, 2 + addiu s1, s1, 2 + sb v0, -1(t8) + addiu s4, s7, -2 + and s4, s4, 3 + addu s5, s4, t8 //end adress +4: + lh v0, 0(s2) + lh v1, 0(t9) + lh t0, 0(s0) + lh t1, 0(s1) + ins v0, v1, 16, 16 + ins t0, t1, 16, 16 + raddu.w.qb t2, v0 + raddu.w.qb s3, t0 + lbu v0, -1(s2) + lbu v1, 2(s2) + lbu t0, -1(t9) + lbu t1, 2(t9) + addu v0, v0, v1 + mult $ac1, t2, t6 + addu t0, t0, t1 + lbu t2, 2(s0) + addu t0, t0, v0 + lbu t3, 2(s1) + addu s3, t0, s3 + lbu v0, -1(s0) + lbu t0, -1(s1) + sll s3, s3, 1 + addu v0, v0, t2 + addu t0, t0, t3 + addu t0, t0, v0 + addu s3, t0, s3 + madd $ac1, s3, t7 + extr_r.w t2, $ac1, 16 + addiu t8, t8, 1 + addiu s2, s2, 2 + addiu t9, t9, 2 + addiu s0, s0, 2 + sb t2, -1(t8) + bne s5, t8, 4b + addiu s1, s1, 2 + addiu s5, s7, -2 + subu s5, s5, s4 + addu s5, s5, t8 //end adress +5: + lh v0, 0(s2) + lh v1, 0(t9) + lh t0, 0(s0) + lh t1, 0(s1) + ins v0, v1, 16, 16 + ins t0, t1, 16, 16 + raddu.w.qb t2, v0 + raddu.w.qb s3, t0 + lbu v0, -1(s2) + lbu v1, 2(s2) + lbu t0, -1(t9) + lbu t1, 2(t9) + addu v0, v0, v1 + mult $ac1, t2, t6 + addu t0, t0, t1 + lbu t2, 2(s0) + addu t0, t0, v0 + lbu t3, 2(s1) + addu s3, t0, s3 + lbu v0, -1(s0) + lbu t0, -1(s1) + sll s3, s3, 1 + addu v0, v0, t2 + addu t0, t0, t3 + lh v1, 2(t9) + addu t0, t0, v0 + lh v0, 2(s2) + addu s3, t0, s3 + lh t0, 2(s0) + lh t1, 2(s1) + madd $ac1, s3, t7 + extr_r.w t2, $ac1, 16 + ins t0, t1, 16, 16 + ins v0, v1, 16, 16 + raddu.w.qb s3, t0 + lbu v1, 4(s2) + lbu t0, 1(t9) + lbu t1, 4(t9) + sb t2, 0(t8) + raddu.w.qb t3, v0 + lbu v0, 1(s2) + addu t0, t0, t1 + mult $ac1, t3, t6 + addu v0, v0, v1 + lbu t2, 4(s0) + addu t0, t0, v0 + lbu v0, 1(s0) + addu s3, t0, s3 + lbu t0, 1(s1) + lbu t3, 4(s1) + addu v0, v0, t2 + sll s3, s3, 1 + addu t0, t0, t3 + lh v1, 4(t9) + addu t0, t0, v0 + lh v0, 4(s2) + addu s3, t0, s3 + lh t0, 4(s0) + lh t1, 4(s1) + madd $ac1, s3, t7 + extr_r.w t2, $ac1, 16 + ins t0, t1, 16, 16 + ins v0, v1, 16, 16 + raddu.w.qb s3, t0 + lbu v1, 6(s2) + lbu t0, 3(t9) + lbu t1, 6(t9) + sb t2, 1(t8) + raddu.w.qb t3, v0 + lbu v0, 3(s2) + addu t0, t0,t1 + mult $ac1, t3, t6 + addu v0, v0, v1 + lbu t2, 6(s0) + addu t0, t0, v0 + lbu v0, 3(s0) + addu s3, t0, s3 + lbu t0, 3(s1) + lbu t3, 6(s1) + addu v0, v0, t2 + sll s3, s3, 1 + addu t0, t0, t3 + lh v1, 6(t9) + addu t0, t0, v0 + lh v0, 6(s2) + addu s3, t0, s3 + lh t0, 6(s0) + lh t1, 6(s1) + madd $ac1, s3, t7 + extr_r.w t3, $ac1, 16 + ins t0, t1, 16, 16 + ins v0, v1, 16, 16 + raddu.w.qb s3, t0 + lbu v1, 8(s2) + lbu t0, 5(t9) + lbu t1, 8(t9) + sb t3, 2(t8) + raddu.w.qb t2, v0 + lbu v0, 5(s2) + addu t0, t0, t1 + mult $ac1, t2, t6 + addu v0, v0, v1 + lbu t2, 8(s0) + addu t0, t0, v0 + lbu v0, 5(s0) + addu s3, t0, s3 + lbu t0, 5(s1) + lbu t3, 8(s1) + addu v0, v0, t2 + sll s3, s3, 1 + addu t0, t0, t3 + addiu t8, t8, 4 + addu t0, t0, v0 + addiu s2, s2, 8 + addu s3, t0, s3 + addiu t9, t9, 8 + madd $ac1, s3, t7 + extr_r.w t1, $ac1, 16 + addiu s0, s0, 8 + addiu s1, s1, 8 + bne s5, t8, 5b + sb t1, -1(t8) +/* Special case for last column */ + lh v0, 0(s2) + lh v1, 0(t9) + lh t0, 0(s0) + lh t1, 0(s1) + ins v0, v1, 16, 16 + ins t0, t1, 16, 16 + raddu.w.qb t2, v0 + raddu.w.qb s3, t0 + lbu v0, -1(s2) + lbu v1, 1(s2) + lbu t0, -1(t9) + lbu t1, 1(t9) + addu v0, v0, v1 + mult $ac1, t2, t6 + addu t0, t0, t1 + lbu t2, 1(s0) + addu t0, t0, v0 + lbu t3, 1(s1) + addu s3, t0, s3 + lbu v0, -1(s0) + lbu t0, -1(s1) + sll s3, s3, 1 + addu v0, v0, t2 + addu t0, t0, t3 + addu t0, t0, v0 + addu s3, t0, s3 + madd $ac1, s3, t7 + extr_r.w t0, $ac1, 16 + addiu t5, t5, 2 + sb t0, 0(t8) + addiu t4, t4, 1 + bne t4, a2, 3b + addiu t5, t5, 2 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop + +END(jsimd_h2v2_downsample_mips_dspr2) +/*****************************************************************************/ LEAF_MIPS_DSPR2(jsimd_h2v1_upsample_mips_dspr2) /* * a0 - cinfo->max_v_samp_factor |