diff options
author | Jonathan Wright <jonathan.wright@arm.com> | 2020-07-01 18:28:44 +0100 |
---|---|---|
committer | Jonathan Wright <jonathan.wright@arm.com> | 2020-09-03 14:46:52 +0100 |
commit | f38898135a3b5097978394dff81c970ad9c4eca2 (patch) | |
tree | 163ad774dc903298debc376d48e45c84d3a0cd21 /simd | |
parent | 628b6a47cc8f16610d1ada02fc5c907652aa735d (diff) | |
download | libjpeg-turbo-f38898135a3b5097978394dff81c970ad9c4eca2.tar.gz |
Implement quantization using Arm NEON intrinsics
Adds an Arm NEON intrinsics implementation of DCT coefficient
quantization.
Removes the NEON assembly implementations for both AArch32 and
AArch64.
Bug: 922430
Change-Id: I114157f8186e6a2a3b3b78db7869fd55ce7f55b3
Diffstat (limited to 'simd')
-rw-r--r-- | simd/arm/arm/jsimd_neon.S | 101 | ||||
-rw-r--r-- | simd/arm/arm64/jsimd_neon.S | 88 | ||||
-rw-r--r-- | simd/arm/common/jquanti-neon.c | 110 |
3 files changed, 109 insertions, 190 deletions
diff --git a/simd/arm/arm/jsimd_neon.S b/simd/arm/arm/jsimd_neon.S index 6565a0d1..8fce4ee1 100644 --- a/simd/arm/arm/jsimd_neon.S +++ b/simd/arm/arm/jsimd_neon.S @@ -188,107 +188,6 @@ asm_function jsimd_fdct_ifast_neon /*****************************************************************************/ /* - * GLOBAL(void) - * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors, - * DCTELEM *workspace); - * - * Note: the code uses 2 stage pipelining in order to improve instructions - * scheduling and eliminate stalls (this provides ~15% better - * performance for this function on both ARM Cortex-A8 and - * ARM Cortex-A9 when compared to the non-pipelined variant). - * The instructions which belong to the second stage use different - * indentation for better readiability. - */ -asm_function jsimd_quantize_neon - - COEF_BLOCK .req r0 - DIVISORS .req r1 - WORKSPACE .req r2 - - RECIPROCAL .req DIVISORS - CORRECTION .req r3 - SHIFT .req ip - LOOP_COUNT .req r4 - - vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]! - vabs.s16 q12, q0 - add CORRECTION, DIVISORS, #(64 * 2) - add SHIFT, DIVISORS, #(64 * 6) - vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]! - vabs.s16 q13, q1 - vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]! - vadd.u16 q12, q12, q10 /* add correction */ - vadd.u16 q13, q13, q11 - vmull.u16 q10, d24, d16 /* multiply by reciprocal */ - vmull.u16 q11, d25, d17 - vmull.u16 q8, d26, d18 - vmull.u16 q9, d27, d19 - vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]! - vshrn.u32 d20, q10, #16 - vshrn.u32 d21, q11, #16 - vshrn.u32 d22, q8, #16 - vshrn.u32 d23, q9, #16 - vneg.s16 q12, q12 - vneg.s16 q13, q13 - vshr.s16 q2, q0, #15 /* extract sign */ - vshr.s16 q3, q1, #15 - vshl.u16 q14, q10, q12 /* shift */ - vshl.u16 q15, q11, q13 - - push {r4, r5} - mov LOOP_COUNT, #3 -1: - vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]! - veor.u16 q14, q14, q2 /* restore sign */ - vabs.s16 q12, q0 - vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]! - vabs.s16 q13, q1 - veor.u16 q15, q15, q3 - vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]! - vadd.u16 q12, q12, q10 /* add correction */ - vadd.u16 q13, q13, q11 - vmull.u16 q10, d24, d16 /* multiply by reciprocal */ - vmull.u16 q11, d25, d17 - vmull.u16 q8, d26, d18 - vmull.u16 q9, d27, d19 - vsub.u16 q14, q14, q2 - vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]! - vsub.u16 q15, q15, q3 - vshrn.u32 d20, q10, #16 - vshrn.u32 d21, q11, #16 - vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]! - vshrn.u32 d22, q8, #16 - vshrn.u32 d23, q9, #16 - vneg.s16 q12, q12 - vneg.s16 q13, q13 - vshr.s16 q2, q0, #15 /* extract sign */ - vshr.s16 q3, q1, #15 - vshl.u16 q14, q10, q12 /* shift */ - vshl.u16 q15, q11, q13 - subs LOOP_COUNT, LOOP_COUNT, #1 - bne 1b - pop {r4, r5} - - veor.u16 q14, q14, q2 /* restore sign */ - veor.u16 q15, q15, q3 - vsub.u16 q14, q14, q2 - vsub.u16 q15, q15, q3 - vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]! - - bx lr /* return */ - - .unreq COEF_BLOCK - .unreq DIVISORS - .unreq WORKSPACE - .unreq RECIPROCAL - .unreq CORRECTION - .unreq SHIFT - .unreq LOOP_COUNT - - -/*****************************************************************************/ - -/* * GLOBAL(JOCTET*) * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer, * JCOEFPTR block, int last_dc_val, diff --git a/simd/arm/arm64/jsimd_neon.S b/simd/arm/arm64/jsimd_neon.S index fc60ad46..d76a570d 100644 --- a/simd/arm/arm64/jsimd_neon.S +++ b/simd/arm/arm64/jsimd_neon.S @@ -597,94 +597,6 @@ asm_function jsimd_fdct_ifast_neon /*****************************************************************************/ /* - * GLOBAL(void) - * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors, - * DCTELEM *workspace); - * - */ -asm_function jsimd_quantize_neon - - COEF_BLOCK .req x0 - DIVISORS .req x1 - WORKSPACE .req x2 - - RECIPROCAL .req DIVISORS - CORRECTION .req x9 - SHIFT .req x10 - LOOP_COUNT .req x11 - - mov LOOP_COUNT, #2 - add CORRECTION, DIVISORS, #(64 * 2) - add SHIFT, DIVISORS, #(64 * 6) -1: - subs LOOP_COUNT, LOOP_COUNT, #1 - ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64 - ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64 - abs v20.8h, v0.8h - abs v21.8h, v1.8h - abs v22.8h, v2.8h - abs v23.8h, v3.8h - ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64 - add v20.8h, v20.8h, v4.8h /* add correction */ - add v21.8h, v21.8h, v5.8h - add v22.8h, v22.8h, v6.8h - add v23.8h, v23.8h, v7.8h - umull v4.4s, v20.4h, v28.4h /* multiply by reciprocal */ - umull2 v16.4s, v20.8h, v28.8h - umull v5.4s, v21.4h, v29.4h - umull2 v17.4s, v21.8h, v29.8h - umull v6.4s, v22.4h, v30.4h /* multiply by reciprocal */ - umull2 v18.4s, v22.8h, v30.8h - umull v7.4s, v23.4h, v31.4h - umull2 v19.4s, v23.8h, v31.8h - ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64 - shrn v4.4h, v4.4s, #16 - shrn v5.4h, v5.4s, #16 - shrn v6.4h, v6.4s, #16 - shrn v7.4h, v7.4s, #16 - shrn2 v4.8h, v16.4s, #16 - shrn2 v5.8h, v17.4s, #16 - shrn2 v6.8h, v18.4s, #16 - shrn2 v7.8h, v19.4s, #16 - neg v24.8h, v24.8h - neg v25.8h, v25.8h - neg v26.8h, v26.8h - neg v27.8h, v27.8h - sshr v0.8h, v0.8h, #15 /* extract sign */ - sshr v1.8h, v1.8h, #15 - sshr v2.8h, v2.8h, #15 - sshr v3.8h, v3.8h, #15 - ushl v4.8h, v4.8h, v24.8h /* shift */ - ushl v5.8h, v5.8h, v25.8h - ushl v6.8h, v6.8h, v26.8h - ushl v7.8h, v7.8h, v27.8h - - eor v4.16b, v4.16b, v0.16b /* restore sign */ - eor v5.16b, v5.16b, v1.16b - eor v6.16b, v6.16b, v2.16b - eor v7.16b, v7.16b, v3.16b - sub v4.8h, v4.8h, v0.8h - sub v5.8h, v5.8h, v1.8h - sub v6.8h, v6.8h, v2.8h - sub v7.8h, v7.8h, v3.8h - st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64 - - b.ne 1b - - br x30 /* return */ - - .unreq COEF_BLOCK - .unreq DIVISORS - .unreq WORKSPACE - .unreq RECIPROCAL - .unreq CORRECTION - .unreq SHIFT - .unreq LOOP_COUNT - - -/*****************************************************************************/ - -/* * GLOBAL(JOCTET *) * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer, * JCOEFPTR block, int last_dc_val, diff --git a/simd/arm/common/jquanti-neon.c b/simd/arm/common/jquanti-neon.c index ed0c1b36..6f8a3ab8 100644 --- a/simd/arm/common/jquanti-neon.c +++ b/simd/arm/common/jquanti-neon.c @@ -1,5 +1,5 @@ /* - * jquanti-neon.c - sample quantization (Arm NEON) + * jquanti-neon.c - sample conversion and integer quantization (Arm NEON) * * Copyright 2020 The Chromium Authors. All Rights Reserved. * @@ -80,3 +80,111 @@ void jsimd_convsamp_neon(JSAMPARRAY sample_data, vst1q_s16(workspace + 6 * DCTSIZE, row6); vst1q_s16(workspace + 7 * DCTSIZE, row7); } + + +/* + * After the DCT, the resulting coefficient values need to be divided by a + * quantization value. + * + * To avoid a slow division operation, the DCT coefficients are multiplied by + * the (scaled) reciprocal of the quantization values and then right-shifted. + * + * The equivalent scalar C function 'quantize' can be found in jcdctmgr.c. + */ + +void jsimd_quantize_neon(JCOEFPTR coef_block, + DCTELEM *divisors, + DCTELEM *workspace) +{ + JCOEFPTR out_ptr = coef_block; + UDCTELEM *recip_ptr = (UDCTELEM *)divisors; + UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2; + DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2; + + for (int i = 0; i < DCTSIZE; i += DCTSIZE / 2) { + /* Load DCT coefficients. */ + int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE); + int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE); + int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE); + int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE); + /* Load reciprocals of quantization values. */ + uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE); + uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE); + uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE); + uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE); + uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE); + uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE); + uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE); + uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE); + int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE); + int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE); + int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE); + int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE); + + /* Extract sign from coefficients. */ + int16x8_t sign_row0 = vshrq_n_s16(row0, 15); + int16x8_t sign_row1 = vshrq_n_s16(row1, 15); + int16x8_t sign_row2 = vshrq_n_s16(row2, 15); + int16x8_t sign_row3 = vshrq_n_s16(row3, 15); + /* Get absolute value of DCT coefficients. */ + uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0)); + uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1)); + uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2)); + uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3)); + /* Add correction. */ + abs_row0 = vaddq_u16(abs_row0, corr0); + abs_row1 = vaddq_u16(abs_row1, corr1); + abs_row2 = vaddq_u16(abs_row2, corr2); + abs_row3 = vaddq_u16(abs_row3, corr3); + + /* Multiply DCT coefficients by quantization reciprocal. */ + int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0), + vget_low_u16(recip0))); + int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0), + vget_high_u16(recip0))); + int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1), + vget_low_u16(recip1))); + int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1), + vget_high_u16(recip1))); + int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2), + vget_low_u16(recip2))); + int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2), + vget_high_u16(recip2))); + int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3), + vget_low_u16(recip3))); + int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3), + vget_high_u16(recip3))); + /* Narrow back to 16-bit. */ + row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16)); + row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16)); + row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16)); + row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16)); + + /* Since VSHR only supports an immediate as its second argument, negate */ + /* the shift value and shift left. */ + row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0), + vnegq_s16(shift0))); + row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1), + vnegq_s16(shift1))); + row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2), + vnegq_s16(shift2))); + row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3), + vnegq_s16(shift3))); + + /* Restore sign to original product. */ + row0 = veorq_s16(row0, sign_row0); + row0 = vsubq_s16(row0, sign_row0); + row1 = veorq_s16(row1, sign_row1); + row1 = vsubq_s16(row1, sign_row1); + row2 = veorq_s16(row2, sign_row2); + row2 = vsubq_s16(row2, sign_row2); + row3 = veorq_s16(row3, sign_row3); + row3 = vsubq_s16(row3, sign_row3); + + /* Store quantized coefficients to memory. */ + vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0); + vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1); + vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2); + vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3); + } +} |