aboutsummaryrefslogtreecommitdiff
path: root/simd
diff options
context:
space:
mode:
authorJonathan Wright <jonathan.wright@arm.com>2019-06-18 16:07:52 +0100
committerJonathan Wright <jonathan.wright@arm.com>2019-10-19 22:46:19 +0000
commitbc13578529255ec75005ffc98aae151666122892 (patch)
tree9b350ae4c29829d4bcc67a21723d044a47ca4bfb /simd
parent9d3bf3e9680156c48041c8b90fece504e3539a61 (diff)
downloadlibjpeg-turbo-bc13578529255ec75005ffc98aae151666122892.tar.gz
Implement slow IDCT using Arm NEON intrinsics
Adds an Arm NEON intrinsics implementation of the 'slow' accurate inverse discrete cosine transform (IDCT). The NEON assembly implementations are removed for both AArch32 and AArch64. Bug: 922430 Change-Id: Ic35eb8f0e5dee0a8a1f71b9ffc303be5e2d3e392
Diffstat (limited to 'simd')
-rw-r--r--simd/arm/arm/jsimd_neon.S607
-rw-r--r--simd/arm/arm64/jsimd_neon.S642
-rw-r--r--simd/arm/common/jidctint-neon.c747
3 files changed, 747 insertions, 1249 deletions
diff --git a/simd/arm/arm/jsimd_neon.S b/simd/arm/arm/jsimd_neon.S
index 9d567233..d19e619e 100644
--- a/simd/arm/arm/jsimd_neon.S
+++ b/simd/arm/arm/jsimd_neon.S
@@ -65,613 +65,6 @@ _\fname:
/*****************************************************************************/
/*
- * Perform dequantization and inverse DCT on one block of coefficients.
- *
- * GLOBAL(void)
- * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
- * JSAMPARRAY output_buf, JDIMENSION output_col)
- */
-
-#define FIX_0_298631336 (2446)
-#define FIX_0_390180644 (3196)
-#define FIX_0_541196100 (4433)
-#define FIX_0_765366865 (6270)
-#define FIX_0_899976223 (7373)
-#define FIX_1_175875602 (9633)
-#define FIX_1_501321110 (12299)
-#define FIX_1_847759065 (15137)
-#define FIX_1_961570560 (16069)
-#define FIX_2_053119869 (16819)
-#define FIX_2_562915447 (20995)
-#define FIX_3_072711026 (25172)
-
-#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
-#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
-#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
-#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
-#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
-#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
-#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
-#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)
-
-/*
- * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
- * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
- */
-#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \
- DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
- JLONG q1, q2, q3, q4, q5, q6, q7; \
- JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \
- \
- /* 1-D iDCT input data */ \
- row0 = xrow0; \
- row1 = xrow1; \
- row2 = xrow2; \
- row3 = xrow3; \
- row4 = xrow4; \
- row5 = xrow5; \
- row6 = xrow6; \
- row7 = xrow7; \
- \
- q5 = row7 + row3; \
- q4 = row5 + row1; \
- q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
- MULTIPLY(q4, FIX_1_175875602); \
- q7 = MULTIPLY(q5, FIX_1_175875602) + \
- MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
- q2 = MULTIPLY(row2, FIX_0_541196100) + \
- MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
- q4 = q6; \
- q3 = ((JLONG)row0 - (JLONG)row4) << 13; \
- q6 += MULTIPLY(row5, -FIX_2_562915447) + \
- MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
- /* now we can use q1 (reloadable constants have been used up) */ \
- q1 = q3 + q2; \
- q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
- MULTIPLY(row1, -FIX_0_899976223); \
- q5 = q7; \
- q1 = q1 + q6; \
- q7 += MULTIPLY(row7, -FIX_0_899976223) + \
- MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
- \
- /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
- tmp11_plus_tmp2 = q1; \
- row1 = 0; \
- \
- q1 = q1 - q6; \
- q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
- MULTIPLY(row3, -FIX_2_562915447); \
- q1 = q1 - q6; \
- q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
- MULTIPLY(row6, FIX_0_541196100); \
- q3 = q3 - q2; \
- \
- /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
- tmp11_minus_tmp2 = q1; \
- \
- q1 = ((JLONG)row0 + (JLONG)row4) << 13; \
- q2 = q1 + q6; \
- q1 = q1 - q6; \
- \
- /* pick up the results */ \
- tmp0 = q4; \
- tmp1 = q5; \
- tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
- tmp3 = q7; \
- tmp10 = q2; \
- tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
- tmp12 = q3; \
- tmp13 = q1; \
-}
-
-#define XFIX_0_899976223 d0[0]
-#define XFIX_0_541196100 d0[1]
-#define XFIX_2_562915447 d0[2]
-#define XFIX_0_298631336_MINUS_0_899976223 d0[3]
-#define XFIX_1_501321110_MINUS_0_899976223 d1[0]
-#define XFIX_2_053119869_MINUS_2_562915447 d1[1]
-#define XFIX_0_541196100_PLUS_0_765366865 d1[2]
-#define XFIX_1_175875602 d1[3]
-#define XFIX_1_175875602_MINUS_0_390180644 d2[0]
-#define XFIX_0_541196100_MINUS_1_847759065 d2[1]
-#define XFIX_3_072711026_MINUS_2_562915447 d2[2]
-#define XFIX_1_175875602_MINUS_1_961570560 d2[3]
-
-.balign 16
-jsimd_idct_islow_neon_consts:
- .short FIX_0_899976223 /* d0[0] */
- .short FIX_0_541196100 /* d0[1] */
- .short FIX_2_562915447 /* d0[2] */
- .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */
- .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */
- .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */
- .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */
- .short FIX_1_175875602 /* d1[3] */
- /* reloadable constants */
- .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */
- .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */
- .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */
- .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */
-
-asm_function jsimd_idct_islow_neon
-
- DCT_TABLE .req r0
- COEF_BLOCK .req r1
- OUTPUT_BUF .req r2
- OUTPUT_COL .req r3
- TMP1 .req r0
- TMP2 .req r1
- TMP3 .req r2
- TMP4 .req ip
-
- ROW0L .req d16
- ROW0R .req d17
- ROW1L .req d18
- ROW1R .req d19
- ROW2L .req d20
- ROW2R .req d21
- ROW3L .req d22
- ROW3R .req d23
- ROW4L .req d24
- ROW4R .req d25
- ROW5L .req d26
- ROW5R .req d27
- ROW6L .req d28
- ROW6R .req d29
- ROW7L .req d30
- ROW7R .req d31
-
- /* Load and dequantize coefficients into NEON registers
- * with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | d16 | d17 ( q8 )
- * 1 | d18 | d19 ( q9 )
- * 2 | d20 | d21 ( q10 )
- * 3 | d22 | d23 ( q11 )
- * 4 | d24 | d25 ( q12 )
- * 5 | d26 | d27 ( q13 )
- * 6 | d28 | d29 ( q14 )
- * 7 | d30 | d31 ( q15 )
- */
- adr ip, jsimd_idct_islow_neon_consts
- vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
- vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
- vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
- vmul.s16 q8, q8, q0
- vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
- vmul.s16 q9, q9, q1
- vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
- vmul.s16 q10, q10, q2
- vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
- vmul.s16 q11, q11, q3
- vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
- vmul.s16 q12, q12, q0
- vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
- vmul.s16 q14, q14, q2
- vmul.s16 q13, q13, q1
- vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */
- add ip, ip, #16
- vmul.s16 q15, q15, q3
- vpush {d8-d15} /* save NEON registers */
- /* 1-D IDCT, pass 1, left 4x8 half */
- vadd.s16 d4, ROW7L, ROW3L
- vadd.s16 d5, ROW5L, ROW1L
- vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560
- vmlal.s16 q6, d5, XFIX_1_175875602
- vmull.s16 q7, d4, XFIX_1_175875602
- /* Check for the zero coefficients in the right 4x8 half */
- push {r4, r5}
- vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644
- vsubl.s16 q3, ROW0L, ROW4L
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
- vmull.s16 q2, ROW2L, XFIX_0_541196100
- vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
- orr r0, r4, r5
- vmov q4, q6
- vmlsl.s16 q6, ROW5L, XFIX_2_562915447
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
- vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
- vshl.s32 q3, q3, #13
- orr r0, r0, r4
- vmlsl.s16 q4, ROW1L, XFIX_0_899976223
- orr r0, r0, r5
- vadd.s32 q1, q3, q2
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
- vmov q5, q7
- vadd.s32 q1, q1, q6
- orr r0, r0, r4
- vmlsl.s16 q7, ROW7L, XFIX_0_899976223
- orr r0, r0, r5
- vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
- vrshrn.s32 ROW1L, q1, #11
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
- vsub.s32 q1, q1, q6
- vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
- orr r0, r0, r4
- vmlsl.s16 q5, ROW3L, XFIX_2_562915447
- orr r0, r0, r5
- vsub.s32 q1, q1, q6
- vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
- vmlal.s16 q6, ROW6L, XFIX_0_541196100
- vsub.s32 q3, q3, q2
- orr r0, r0, r4
- vrshrn.s32 ROW6L, q1, #11
- orr r0, r0, r5
- vadd.s32 q1, q3, q5
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
- vsub.s32 q3, q3, q5
- vaddl.s16 q5, ROW0L, ROW4L
- orr r0, r0, r4
- vrshrn.s32 ROW2L, q1, #11
- orr r0, r0, r5
- vrshrn.s32 ROW5L, q3, #11
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
- vshl.s32 q5, q5, #13
- vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
- orr r0, r0, r4
- vadd.s32 q2, q5, q6
- orrs r0, r0, r5
- vsub.s32 q1, q5, q6
- vadd.s32 q6, q2, q7
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
- vsub.s32 q2, q2, q7
- vadd.s32 q5, q1, q4
- orr r0, r4, r5
- vsub.s32 q3, q1, q4
- pop {r4, r5}
- vrshrn.s32 ROW7L, q2, #11
- vrshrn.s32 ROW3L, q5, #11
- vrshrn.s32 ROW0L, q6, #11
- vrshrn.s32 ROW4L, q3, #11
-
- beq 3f /* Go to do some special handling for the sparse
- right 4x8 half */
-
- /* 1-D IDCT, pass 1, right 4x8 half */
- vld1.s16 {d2}, [ip, :64] /* reload constants */
- vadd.s16 d10, ROW7R, ROW3R
- vadd.s16 d8, ROW5R, ROW1R
- /* Transpose left 4x8 half */
- vtrn.16 ROW6L, ROW7L
- vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560
- vmlal.s16 q6, d8, XFIX_1_175875602
- vtrn.16 ROW2L, ROW3L
- vmull.s16 q7, d10, XFIX_1_175875602
- vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644
- vtrn.16 ROW0L, ROW1L
- vsubl.s16 q3, ROW0R, ROW4R
- vmull.s16 q2, ROW2R, XFIX_0_541196100
- vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
- vtrn.16 ROW4L, ROW5L
- vmov q4, q6
- vmlsl.s16 q6, ROW5R, XFIX_2_562915447
- vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
- vtrn.32 ROW1L, ROW3L
- vshl.s32 q3, q3, #13
- vmlsl.s16 q4, ROW1R, XFIX_0_899976223
- vtrn.32 ROW4L, ROW6L
- vadd.s32 q1, q3, q2
- vmov q5, q7
- vadd.s32 q1, q1, q6
- vtrn.32 ROW0L, ROW2L
- vmlsl.s16 q7, ROW7R, XFIX_0_899976223
- vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
- vrshrn.s32 ROW1R, q1, #11
- vtrn.32 ROW5L, ROW7L
- vsub.s32 q1, q1, q6
- vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
- vmlsl.s16 q5, ROW3R, XFIX_2_562915447
- vsub.s32 q1, q1, q6
- vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
- vmlal.s16 q6, ROW6R, XFIX_0_541196100
- vsub.s32 q3, q3, q2
- vrshrn.s32 ROW6R, q1, #11
- vadd.s32 q1, q3, q5
- vsub.s32 q3, q3, q5
- vaddl.s16 q5, ROW0R, ROW4R
- vrshrn.s32 ROW2R, q1, #11
- vrshrn.s32 ROW5R, q3, #11
- vshl.s32 q5, q5, #13
- vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
- vadd.s32 q2, q5, q6
- vsub.s32 q1, q5, q6
- vadd.s32 q6, q2, q7
- vsub.s32 q2, q2, q7
- vadd.s32 q5, q1, q4
- vsub.s32 q3, q1, q4
- vrshrn.s32 ROW7R, q2, #11
- vrshrn.s32 ROW3R, q5, #11
- vrshrn.s32 ROW0R, q6, #11
- vrshrn.s32 ROW4R, q3, #11
- /* Transpose right 4x8 half */
- vtrn.16 ROW6R, ROW7R
- vtrn.16 ROW2R, ROW3R
- vtrn.16 ROW0R, ROW1R
- vtrn.16 ROW4R, ROW5R
- vtrn.32 ROW1R, ROW3R
- vtrn.32 ROW4R, ROW6R
- vtrn.32 ROW0R, ROW2R
- vtrn.32 ROW5R, ROW7R
-
-1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
- vld1.s16 {d2}, [ip, :64] /* reload constants */
- vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
- vmlal.s16 q6, ROW1L, XFIX_1_175875602
- vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
- vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
- vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
- vmlal.s16 q7, ROW3L, XFIX_1_175875602
- vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
- vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
- vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */
- vmull.s16 q2, ROW2L, XFIX_0_541196100
- vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
- vmov q4, q6
- vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
- vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
- vshl.s32 q3, q3, #13
- vmlsl.s16 q4, ROW1L, XFIX_0_899976223
- vadd.s32 q1, q3, q2
- vmov q5, q7
- vadd.s32 q1, q1, q6
- vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
- vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
- vshrn.s32 ROW1L, q1, #16
- vsub.s32 q1, q1, q6
- vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
- vmlsl.s16 q5, ROW3L, XFIX_2_562915447
- vsub.s32 q1, q1, q6
- vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
- vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
- vsub.s32 q3, q3, q2
- vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
- vadd.s32 q1, q3, q5
- vsub.s32 q3, q3, q5
- vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */
- vshrn.s32 ROW2L, q1, #16
- vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
- vshl.s32 q5, q5, #13
- vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
- vadd.s32 q2, q5, q6
- vsub.s32 q1, q5, q6
- vadd.s32 q6, q2, q7
- vsub.s32 q2, q2, q7
- vadd.s32 q5, q1, q4
- vsub.s32 q3, q1, q4
- vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
- vshrn.s32 ROW3L, q5, #16
- vshrn.s32 ROW0L, q6, #16
- vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
- /* 1-D IDCT, pass 2, right 4x8 half */
- vld1.s16 {d2}, [ip, :64] /* reload constants */
- vmull.s16 q6, ROW5R, XFIX_1_175875602
- vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
- vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
- vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
- vmull.s16 q7, ROW7R, XFIX_1_175875602
- vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
- vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
- vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
- vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */
- vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
- vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
- vmov q4, q6
- vmlsl.s16 q6, ROW5R, XFIX_2_562915447
- vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
- vshl.s32 q3, q3, #13
- vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
- vadd.s32 q1, q3, q2
- vmov q5, q7
- vadd.s32 q1, q1, q6
- vmlsl.s16 q7, ROW7R, XFIX_0_899976223
- vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
- vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
- vsub.s32 q1, q1, q6
- vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
- vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
- vsub.s32 q1, q1, q6
- vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
- vmlal.s16 q6, ROW6R, XFIX_0_541196100
- vsub.s32 q3, q3, q2
- vshrn.s32 ROW6R, q1, #16
- vadd.s32 q1, q3, q5
- vsub.s32 q3, q3, q5
- vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */
- vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
- vshrn.s32 ROW5R, q3, #16
- vshl.s32 q5, q5, #13
- vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
- vadd.s32 q2, q5, q6
- vsub.s32 q1, q5, q6
- vadd.s32 q6, q2, q7
- vsub.s32 q2, q2, q7
- vadd.s32 q5, q1, q4
- vsub.s32 q3, q1, q4
- vshrn.s32 ROW7R, q2, #16
- vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
- vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
- vshrn.s32 ROW4R, q3, #16
-
-2: /* Descale to 8-bit and range limit */
- vqrshrn.s16 d16, q8, #2
- vqrshrn.s16 d17, q9, #2
- vqrshrn.s16 d18, q10, #2
- vqrshrn.s16 d19, q11, #2
- vpop {d8-d15} /* restore NEON registers */
- vqrshrn.s16 d20, q12, #2
- /* Transpose the final 8-bit samples and do signed->unsigned conversion */
- vtrn.16 q8, q9
- vqrshrn.s16 d21, q13, #2
- vqrshrn.s16 d22, q14, #2
- vmov.u8 q0, #(CENTERJSAMPLE)
- vqrshrn.s16 d23, q15, #2
- vtrn.8 d16, d17
- vtrn.8 d18, d19
- vadd.u8 q8, q8, q0
- vadd.u8 q9, q9, q0
- vtrn.16 q10, q11
- /* Store results to the output buffer */
- ldmia OUTPUT_BUF!, {TMP1, TMP2}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- vst1.8 {d16}, [TMP1]
- vtrn.8 d20, d21
- vst1.8 {d17}, [TMP2]
- ldmia OUTPUT_BUF!, {TMP1, TMP2}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- vst1.8 {d18}, [TMP1]
- vadd.u8 q10, q10, q0
- vst1.8 {d19}, [TMP2]
- ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- add TMP3, TMP3, OUTPUT_COL
- add TMP4, TMP4, OUTPUT_COL
- vtrn.8 d22, d23
- vst1.8 {d20}, [TMP1]
- vadd.u8 q11, q11, q0
- vst1.8 {d21}, [TMP2]
- vst1.8 {d22}, [TMP3]
- vst1.8 {d23}, [TMP4]
- bx lr
-
-3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
-
- /* Transpose left 4x8 half */
- vtrn.16 ROW6L, ROW7L
- vtrn.16 ROW2L, ROW3L
- vtrn.16 ROW0L, ROW1L
- vtrn.16 ROW4L, ROW5L
- vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */
- vtrn.32 ROW1L, ROW3L
- vtrn.32 ROW4L, ROW6L
- vtrn.32 ROW0L, ROW2L
- vtrn.32 ROW5L, ROW7L
-
- cmp r0, #0
- beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second
- pass */
-
- /* Only row 0 is non-zero for the right 4x8 half */
- vdup.s16 ROW1R, ROW0R[1]
- vdup.s16 ROW2R, ROW0R[2]
- vdup.s16 ROW3R, ROW0R[3]
- vdup.s16 ROW4R, ROW0R[0]
- vdup.s16 ROW5R, ROW0R[1]
- vdup.s16 ROW6R, ROW0R[2]
- vdup.s16 ROW7R, ROW0R[3]
- vdup.s16 ROW0R, ROW0R[0]
- b 1b /* Go to 'normal' second pass */
-
-4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
- vld1.s16 {d2}, [ip, :64] /* reload constants */
- vmull.s16 q6, ROW1L, XFIX_1_175875602
- vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
- vmull.s16 q7, ROW3L, XFIX_1_175875602
- vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
- vmull.s16 q2, ROW2L, XFIX_0_541196100
- vshll.s16 q3, ROW0L, #13
- vmov q4, q6
- vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
- vmlsl.s16 q4, ROW1L, XFIX_0_899976223
- vadd.s32 q1, q3, q2
- vmov q5, q7
- vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
- vadd.s32 q1, q1, q6
- vadd.s32 q6, q6, q6
- vmlsl.s16 q5, ROW3L, XFIX_2_562915447
- vshrn.s32 ROW1L, q1, #16
- vsub.s32 q1, q1, q6
- vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
- vsub.s32 q3, q3, q2
- vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
- vadd.s32 q1, q3, q5
- vsub.s32 q3, q3, q5
- vshll.s16 q5, ROW0L, #13
- vshrn.s32 ROW2L, q1, #16
- vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
- vadd.s32 q2, q5, q6
- vsub.s32 q1, q5, q6
- vadd.s32 q6, q2, q7
- vsub.s32 q2, q2, q7
- vadd.s32 q5, q1, q4
- vsub.s32 q3, q1, q4
- vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
- vshrn.s32 ROW3L, q5, #16
- vshrn.s32 ROW0L, q6, #16
- vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
- /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
- vld1.s16 {d2}, [ip, :64] /* reload constants */
- vmull.s16 q6, ROW5L, XFIX_1_175875602
- vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
- vmull.s16 q7, ROW7L, XFIX_1_175875602
- vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
- vmull.s16 q2, ROW6L, XFIX_0_541196100
- vshll.s16 q3, ROW4L, #13
- vmov q4, q6
- vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
- vmlsl.s16 q4, ROW5L, XFIX_0_899976223
- vadd.s32 q1, q3, q2
- vmov q5, q7
- vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
- vadd.s32 q1, q1, q6
- vadd.s32 q6, q6, q6
- vmlsl.s16 q5, ROW7L, XFIX_2_562915447
- vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
- vsub.s32 q1, q1, q6
- vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
- vsub.s32 q3, q3, q2
- vshrn.s32 ROW6R, q1, #16
- vadd.s32 q1, q3, q5
- vsub.s32 q3, q3, q5
- vshll.s16 q5, ROW4L, #13
- vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
- vshrn.s32 ROW5R, q3, #16
- vadd.s32 q2, q5, q6
- vsub.s32 q1, q5, q6
- vadd.s32 q6, q2, q7
- vsub.s32 q2, q2, q7
- vadd.s32 q5, q1, q4
- vsub.s32 q3, q1, q4
- vshrn.s32 ROW7R, q2, #16
- vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
- vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
- vshrn.s32 ROW4R, q3, #16
- b 2b /* Go to epilogue */
-
- .unreq DCT_TABLE
- .unreq COEF_BLOCK
- .unreq OUTPUT_BUF
- .unreq OUTPUT_COL
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
-
- .unreq ROW0L
- .unreq ROW0R
- .unreq ROW1L
- .unreq ROW1R
- .unreq ROW2L
- .unreq ROW2R
- .unreq ROW3L
- .unreq ROW3R
- .unreq ROW4L
- .unreq ROW4R
- .unreq ROW5L
- .unreq ROW5R
- .unreq ROW6L
- .unreq ROW6R
- .unreq ROW7L
- .unreq ROW7R
-
-
-/*****************************************************************************/
-
-/*
* jsimd_idct_ifast_neon
*
* This function contains a fast, not so accurate integer implementation of
diff --git a/simd/arm/arm64/jsimd_neon.S b/simd/arm/arm64/jsimd_neon.S
index 5077a168..4d387416 100644
--- a/simd/arm/arm64/jsimd_neon.S
+++ b/simd/arm/arm64/jsimd_neon.S
@@ -90,648 +90,6 @@ _\fname:
/*****************************************************************************/
/*
- * Perform dequantization and inverse DCT on one block of coefficients.
- *
- * GLOBAL(void)
- * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
- * JSAMPARRAY output_buf, JDIMENSION output_col)
- */
-
-#define CONST_BITS 13
-#define PASS1_BITS 2
-
-#define F_0_298 2446 /* FIX(0.298631336) */
-#define F_0_390 3196 /* FIX(0.390180644) */
-#define F_0_541 4433 /* FIX(0.541196100) */
-#define F_0_765 6270 /* FIX(0.765366865) */
-#define F_0_899 7373 /* FIX(0.899976223) */
-#define F_1_175 9633 /* FIX(1.175875602) */
-#define F_1_501 12299 /* FIX(1.501321110) */
-#define F_1_847 15137 /* FIX(1.847759065) */
-#define F_1_961 16069 /* FIX(1.961570560) */
-#define F_2_053 16819 /* FIX(2.053119869) */
-#define F_2_562 20995 /* FIX(2.562915447) */
-#define F_3_072 25172 /* FIX(3.072711026) */
-
-.balign 16
-Ljsimd_idct_islow_neon_consts:
- .short F_0_298
- .short -F_0_390
- .short F_0_541
- .short F_0_765
- .short - F_0_899
- .short F_1_175
- .short F_1_501
- .short - F_1_847
- .short - F_1_961
- .short F_2_053
- .short - F_2_562
- .short F_3_072
- .short 0 /* padding */
- .short 0
- .short 0
- .short 0
-
-#undef F_0_298
-#undef F_0_390
-#undef F_0_541
-#undef F_0_765
-#undef F_0_899
-#undef F_1_175
-#undef F_1_501
-#undef F_1_847
-#undef F_1_961
-#undef F_2_053
-#undef F_2_562
-#undef F_3_072
-
-#define XFIX_P_0_298 v0.h[0]
-#define XFIX_N_0_390 v0.h[1]
-#define XFIX_P_0_541 v0.h[2]
-#define XFIX_P_0_765 v0.h[3]
-#define XFIX_N_0_899 v0.h[4]
-#define XFIX_P_1_175 v0.h[5]
-#define XFIX_P_1_501 v0.h[6]
-#define XFIX_N_1_847 v0.h[7]
-#define XFIX_N_1_961 v1.h[0]
-#define XFIX_P_2_053 v1.h[1]
-#define XFIX_N_2_562 v1.h[2]
-#define XFIX_P_3_072 v1.h[3]
-
-asm_function jsimd_idct_islow_neon
- DCT_TABLE .req x0
- COEF_BLOCK .req x1
- OUTPUT_BUF .req x2
- OUTPUT_COL .req x3
- TMP1 .req x0
- TMP2 .req x1
- TMP3 .req x9
- TMP4 .req x10
- TMP5 .req x11
- TMP6 .req x12
- TMP7 .req x13
- TMP8 .req x14
-
- /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
- guarantee that the upper (unused) 32 bits of x3 are valid. This
- instruction ensures that those bits are set to zero. */
- uxtw x3, w3
-
- sub sp, sp, #64
- adr x15, Ljsimd_idct_islow_neon_consts
- mov x10, sp
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
- ld1 {v0.8h, v1.8h}, [x15]
- ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
- ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
- ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
- ld1 {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64
-
- cmeq v16.8h, v3.8h, #0
- cmeq v26.8h, v4.8h, #0
- cmeq v27.8h, v5.8h, #0
- cmeq v28.8h, v6.8h, #0
- cmeq v29.8h, v7.8h, #0
- cmeq v30.8h, v8.8h, #0
- cmeq v31.8h, v9.8h, #0
-
- and v10.16b, v16.16b, v26.16b
- and v11.16b, v27.16b, v28.16b
- and v12.16b, v29.16b, v30.16b
- and v13.16b, v31.16b, v10.16b
- and v14.16b, v11.16b, v12.16b
- mul v2.8h, v2.8h, v18.8h
- and v15.16b, v13.16b, v14.16b
- shl v10.8h, v2.8h, #(PASS1_BITS)
- sqxtn v16.8b, v15.8h
- mov TMP1, v16.d[0]
- mvn TMP2, TMP1
-
- cbnz TMP2, 2f
- /* case all AC coeffs are zeros */
- dup v2.2d, v10.d[0]
- dup v6.2d, v10.d[1]
- mov v3.16b, v2.16b
- mov v7.16b, v6.16b
- mov v4.16b, v2.16b
- mov v8.16b, v6.16b
- mov v5.16b, v2.16b
- mov v9.16b, v6.16b
-1:
- /* for this transpose, we should organise data like this:
- * 00, 01, 02, 03, 40, 41, 42, 43
- * 10, 11, 12, 13, 50, 51, 52, 53
- * 20, 21, 22, 23, 60, 61, 62, 63
- * 30, 31, 32, 33, 70, 71, 72, 73
- * 04, 05, 06, 07, 44, 45, 46, 47
- * 14, 15, 16, 17, 54, 55, 56, 57
- * 24, 25, 26, 27, 64, 65, 66, 67
- * 34, 35, 36, 37, 74, 75, 76, 77
- */
- trn1 v28.8h, v2.8h, v3.8h
- trn1 v29.8h, v4.8h, v5.8h
- trn1 v30.8h, v6.8h, v7.8h
- trn1 v31.8h, v8.8h, v9.8h
- trn2 v16.8h, v2.8h, v3.8h
- trn2 v17.8h, v4.8h, v5.8h
- trn2 v18.8h, v6.8h, v7.8h
- trn2 v19.8h, v8.8h, v9.8h
- trn1 v2.4s, v28.4s, v29.4s
- trn1 v6.4s, v30.4s, v31.4s
- trn1 v3.4s, v16.4s, v17.4s
- trn1 v7.4s, v18.4s, v19.4s
- trn2 v4.4s, v28.4s, v29.4s
- trn2 v8.4s, v30.4s, v31.4s
- trn2 v5.4s, v16.4s, v17.4s
- trn2 v9.4s, v18.4s, v19.4s
- /* Even part: reverse the even part of the forward DCT. */
- add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
- add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
- smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
- sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
- smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
- sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
- mov v21.16b, v19.16b /* tmp3 = z1 */
- mov v20.16b, v18.16b /* tmp3 = z1 */
- smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
- smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
- sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
- smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
- smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
- sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
- sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
- add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
- sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
- add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
- sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
- add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
- sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
- add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
- sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
-
- /* Odd part per figure 8; the matrix is unitary and hence its
- * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
- */
-
- add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
- add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
- add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
- add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
- add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
-
- smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
- smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
- smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
- smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
- smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
- smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
- smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
- smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
-
- smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
- smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
- smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
- smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
- smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
- smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
- smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
- smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
-
- add v23.4s, v23.4s, v27.4s /* z3 += z5 */
- add v22.4s, v22.4s, v26.4s /* z3 += z5 */
- add v25.4s, v25.4s, v27.4s /* z4 += z5 */
- add v24.4s, v24.4s, v26.4s /* z4 += z5 */
-
- add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
- add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
- add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
- add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
- add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
- add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
- add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
- add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
-
- add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
- add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
- add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
- add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
- add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
- add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
- add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
- add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
-
- /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
-
- add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
- add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
- sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
- sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
- add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
- add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
- sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
- sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
- add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
- add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
- sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
- sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
- add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
- add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
- sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
- sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
-
- shrn v2.4h, v18.4s, #16 /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
- shrn v9.4h, v20.4s, #16 /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
- shrn v3.4h, v22.4s, #16 /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
- shrn v8.4h, v24.4s, #16 /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
- shrn v4.4h, v26.4s, #16 /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
- shrn v7.4h, v28.4s, #16 /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
- shrn v5.4h, v14.4s, #16 /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
- shrn v6.4h, v16.4s, #16 /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
- shrn2 v2.8h, v19.4s, #16 /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
- shrn2 v9.8h, v21.4s, #16 /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
- shrn2 v3.8h, v23.4s, #16 /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
- shrn2 v8.8h, v25.4s, #16 /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
- shrn2 v4.8h, v27.4s, #16 /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
- shrn2 v7.8h, v29.4s, #16 /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
- shrn2 v5.8h, v15.4s, #16 /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
- shrn2 v6.8h, v17.4s, #16 /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
- movi v0.16b, #(CENTERJSAMPLE)
- /* Prepare pointers (dual-issue with NEON instructions) */
- ldp TMP1, TMP2, [OUTPUT_BUF], 16
- sqrshrn v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16)
- ldp TMP3, TMP4, [OUTPUT_BUF], 16
- sqrshrn v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16)
- add TMP1, TMP1, OUTPUT_COL
- sqrshrn v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16)
- add TMP2, TMP2, OUTPUT_COL
- sqrshrn v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16)
- add TMP3, TMP3, OUTPUT_COL
- sqrshrn2 v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16)
- add TMP4, TMP4, OUTPUT_COL
- sqrshrn2 v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16)
- ldp TMP5, TMP6, [OUTPUT_BUF], 16
- sqrshrn2 v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16)
- ldp TMP7, TMP8, [OUTPUT_BUF], 16
- sqrshrn2 v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16)
- add TMP5, TMP5, OUTPUT_COL
- add v16.16b, v28.16b, v0.16b
- add TMP6, TMP6, OUTPUT_COL
- add v18.16b, v29.16b, v0.16b
- add TMP7, TMP7, OUTPUT_COL
- add v20.16b, v30.16b, v0.16b
- add TMP8, TMP8, OUTPUT_COL
- add v22.16b, v31.16b, v0.16b
-
- /* Transpose the final 8-bit samples */
- trn1 v28.16b, v16.16b, v18.16b
- trn1 v30.16b, v20.16b, v22.16b
- trn2 v29.16b, v16.16b, v18.16b
- trn2 v31.16b, v20.16b, v22.16b
-
- trn1 v16.8h, v28.8h, v30.8h
- trn2 v18.8h, v28.8h, v30.8h
- trn1 v20.8h, v29.8h, v31.8h
- trn2 v22.8h, v29.8h, v31.8h
-
- uzp1 v28.4s, v16.4s, v18.4s
- uzp2 v30.4s, v16.4s, v18.4s
- uzp1 v29.4s, v20.4s, v22.4s
- uzp2 v31.4s, v20.4s, v22.4s
-
- /* Store results to the output buffer */
- st1 {v28.d}[0], [TMP1]
- st1 {v29.d}[0], [TMP2]
- st1 {v28.d}[1], [TMP3]
- st1 {v29.d}[1], [TMP4]
- st1 {v30.d}[0], [TMP5]
- st1 {v31.d}[0], [TMP6]
- st1 {v30.d}[1], [TMP7]
- st1 {v31.d}[1], [TMP8]
- ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
- ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
- blr x30
-
-.balign 16
-2:
- mul v3.8h, v3.8h, v19.8h
- mul v4.8h, v4.8h, v20.8h
- mul v5.8h, v5.8h, v21.8h
- add TMP4, xzr, TMP2, LSL #32
- mul v6.8h, v6.8h, v22.8h
- mul v7.8h, v7.8h, v23.8h
- adds TMP3, xzr, TMP2, LSR #32
- mul v8.8h, v8.8h, v24.8h
- mul v9.8h, v9.8h, v25.8h
- b.ne 3f
- /* Right AC coef is zero */
- dup v15.2d, v10.d[1]
- /* Even part: reverse the even part of the forward DCT. */
- add v18.4h, v4.4h, v8.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
- add v22.4h, v2.4h, v6.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
- sub v26.4h, v2.4h, v6.4h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
- smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
- sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
- mov v20.16b, v18.16b /* tmp3 = z1 */
- sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
- smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
- smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
- add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
- sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
- add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
- sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
-
- /* Odd part per figure 8; the matrix is unitary and hence its
- * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
- */
-
- add v22.4h, v9.4h, v5.4h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
- add v24.4h, v7.4h, v3.4h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
- add v18.4h, v9.4h, v3.4h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
- add v20.4h, v7.4h, v5.4h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
- add v26.4h, v22.4h, v24.4h /* z5 = z3 + z4 */
-
- smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
- smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
- smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
- smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
- smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
- smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
- smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
- smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
-
- add v22.4s, v22.4s, v26.4s /* z3 += z5 */
- add v24.4s, v24.4s, v26.4s /* z4 += z5 */
-
- add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
- add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
- add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
- add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
-
- add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
- add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
- add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
- add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
-
- /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
-
- add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
- sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
- add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
- sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
- add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
- sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
- add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
- sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
-
- rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
- rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
- rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
- rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
- rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
- rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
- mov v6.16b, v15.16b
- mov v7.16b, v15.16b
- mov v8.16b, v15.16b
- mov v9.16b, v15.16b
- b 1b
-
-.balign 16
-3:
- cbnz TMP4, 4f
- /* Left AC coef is zero */
- dup v14.2d, v10.d[0]
- /* Even part: reverse the even part of the forward DCT. */
- add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
- add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
- smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
- sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
- sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
- mov v21.16b, v19.16b /* tmp3 = z1 */
- smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
- sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
- smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
- add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
- sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
- add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
- sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
-
- /* Odd part per figure 8; the matrix is unitary and hence its
- * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
- */
-
- add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
- add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
- add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
- add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
- add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
-
- smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
- smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
- smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
- smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
- smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
- smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
- smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
- smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
-
- add v23.4s, v23.4s, v27.4s /* z3 += z5 */
- add v22.4s, v22.4s, v26.4s /* z3 += z5 */
- add v25.4s, v25.4s, v27.4s /* z4 += z5 */
- add v24.4s, v24.4s, v26.4s /* z4 += z5 */
-
- add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
- add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
- add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
- add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
-
- add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
- add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
- add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
- add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
-
- /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
-
- add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
- sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
- add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
- sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
- add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
- sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
- add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
- sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
-
- mov v2.16b, v14.16b
- mov v3.16b, v14.16b
- mov v4.16b, v14.16b
- mov v5.16b, v14.16b
- rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
- rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
- rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
- rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
- rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
- rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
- b 1b
-
-.balign 16
-4:
- /* "No" AC coef is zero */
- /* Even part: reverse the even part of the forward DCT. */
- add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
- add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
- smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
- sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
- smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
- sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
- mov v21.16b, v19.16b /* tmp3 = z1 */
- mov v20.16b, v18.16b /* tmp3 = z1 */
- smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
- smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
- sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
- smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
- smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
- sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
- sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
- add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
- sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
- add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
- sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
- add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
- sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
- add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
- sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
-
- /* Odd part per figure 8; the matrix is unitary and hence its
- * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
- */
-
- add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
- add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
- add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
- add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
- add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
-
- smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
- smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
- smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
- smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
- smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
- smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
- smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
- smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
-
- smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
- smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
- smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
- smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
- smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
- smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
- smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
- smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
-
- add v23.4s, v23.4s, v27.4s /* z3 += z5 */
- add v22.4s, v22.4s, v26.4s /* z3 += z5 */
- add v25.4s, v25.4s, v27.4s /* z4 += z5 */
- add v24.4s, v24.4s, v26.4s /* z4 += z5 */
-
- add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
- add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
- add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
- add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
- add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
- add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
- add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
- add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
-
- add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
- add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
- add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
- add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
- add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
- add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
- add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
- add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
-
- /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
-
- add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
- add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
- sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
- sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
- add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
- add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
- sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
- sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
- add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
- add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
- sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
- sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
- add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
- add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
- sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
- sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
-
- rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
- rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
- rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
- rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
- rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
- rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
- rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
- rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
- rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
- rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
- rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
- rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
- rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
- b 1b
-
- .unreq DCT_TABLE
- .unreq COEF_BLOCK
- .unreq OUTPUT_BUF
- .unreq OUTPUT_COL
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
- .unreq TMP5
- .unreq TMP6
- .unreq TMP7
- .unreq TMP8
-
-#undef CENTERJSAMPLE
-#undef CONST_BITS
-#undef PASS1_BITS
-#undef XFIX_P_0_298
-#undef XFIX_N_0_390
-#undef XFIX_P_0_541
-#undef XFIX_P_0_765
-#undef XFIX_N_0_899
-#undef XFIX_P_1_175
-#undef XFIX_P_1_501
-#undef XFIX_N_1_847
-#undef XFIX_N_1_961
-#undef XFIX_P_2_053
-#undef XFIX_N_2_562
-#undef XFIX_P_3_072
-
-
-/*****************************************************************************/
-
-/*
* jsimd_idct_ifast_neon
*
* This function contains a fast, not so accurate integer implementation of
diff --git a/simd/arm/common/jidctint-neon.c b/simd/arm/common/jidctint-neon.c
new file mode 100644
index 00000000..7fb683b4
--- /dev/null
+++ b/simd/arm/common/jidctint-neon.c
@@ -0,0 +1,747 @@
+/*
+ * jidctint-neon.c - slow IDCT (Arm NEON)
+ *
+ * Copyright 2019 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <arm_neon.h>
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
+
+/* The computation of the inverse DCT requires the use of constants known at
+ * compile-time. Scaled integer constants are used to avoid floating-point
+ * arithmetic:
+ * 0.298631336 = 2446 * 2^-13
+ * 0.390180644 = 3196 * 2^-13
+ * 0.541196100 = 4433 * 2^-13
+ * 0.765366865 = 6270 * 2^-13
+ * 0.899976223 = 7373 * 2^-13
+ * 1.175875602 = 9633 * 2^-13
+ * 1.501321110 = 12299 * 2^-13
+ * 1.847759065 = 15137 * 2^-13
+ * 1.961570560 = 16069 * 2^-13
+ * 2.053119869 = 16819 * 2^-13
+ * 2.562915447 = 20995 * 2^-13
+ * 3.072711026 = 25172 * 2^-13
+ */
+
+#define F_0_298 2446
+#define F_0_390 3196
+#define F_0_541 4433
+#define F_0_765 6270
+#define F_0_899 7373
+#define F_1_175 9633
+#define F_1_501 12299
+#define F_1_847 15137
+#define F_1_961 16069
+#define F_2_053 16819
+#define F_2_562 20995
+#define F_3_072 25172
+
+#define F_1_175_MINUS_1_961 (F_1_175 - F_1_961)
+#define F_1_175_MINUS_0_390 (F_1_175 - F_0_390)
+#define F_0_541_MINUS_1_847 (F_0_541 - F_1_847)
+#define F_3_072_MINUS_2_562 (F_3_072 - F_2_562)
+#define F_0_298_MINUS_0_899 (F_0_298 - F_0_899)
+#define F_1_501_MINUS_0_899 (F_1_501 - F_0_899)
+#define F_2_053_MINUS_2_562 (F_2_053 - F_2_562)
+#define F_0_541_PLUS_0_765 (F_0_541 + F_0_765)
+
+__attribute__ ((aligned(8))) static int16_t jsimd_idct_islow_neon_consts[] = {
+ F_0_899, F_0_541,
+ F_2_562, F_0_298_MINUS_0_899,
+ F_1_501_MINUS_0_899, F_2_053_MINUS_2_562,
+ F_0_541_PLUS_0_765, F_1_175,
+ F_1_175_MINUS_0_390, F_0_541_MINUS_1_847,
+ F_3_072_MINUS_2_562, F_1_175_MINUS_1_961,
+ 0, 0, 0, 0
+ };
+
+/* Forward declaration of regular and sparse IDCT helper functions. */
+
+static inline void jsimd_idct_islow_pass1_regular(int16x4_t row0,
+ int16x4_t row1,
+ int16x4_t row2,
+ int16x4_t row3,
+ int16x4_t row4,
+ int16x4_t row5,
+ int16x4_t row6,
+ int16x4_t row7,
+ int16x4_t quant_row0,
+ int16x4_t quant_row1,
+ int16x4_t quant_row2,
+ int16x4_t quant_row3,
+ int16x4_t quant_row4,
+ int16x4_t quant_row5,
+ int16x4_t quant_row6,
+ int16x4_t quant_row7,
+ int16_t *workspace_1,
+ int16_t *workspace_2);
+
+static inline void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
+ int16x4_t row1,
+ int16x4_t row2,
+ int16x4_t row3,
+ int16x4_t quant_row0,
+ int16x4_t quant_row1,
+ int16x4_t quant_row2,
+ int16x4_t quant_row3,
+ int16_t *workspace_1,
+ int16_t *workspace_2);
+
+static inline void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col,
+ unsigned buf_offset);
+
+static inline void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col,
+ unsigned buf_offset);
+
+
+/* Performs dequantization and inverse DCT on one block of coefficients. For
+ * reference, the C implementation 'jpeg_idct_slow' can be found jidctint.c.
+ *
+ * Optimization techniques used for data access:
+ *
+ * In each pass, the inverse DCT is computed on the left and right 4x8 halves
+ * of the DCT block. This avoids spilling due to register pressure and the
+ * increased granularity allows an optimized calculation depending on the
+ * values of the DCT coefficients. Between passes, intermediate data is stored
+ * in 4x8 workspace buffers.
+ *
+ * Transposing the 8x8 DCT block after each pass can be achieved by transposing
+ * each of the four 4x4 quadrants, and swapping quadrants 1 and 2 (in the
+ * diagram below.) Swapping quadrants is cheap as the second pass can just load
+ * from the other workspace buffer.
+ *
+ * +-------+-------+ +-------+-------+
+ * | | | | | |
+ * | 0 | 1 | | 0 | 2 |
+ * | | | transpose | | |
+ * +-------+-------+ ------> +-------+-------+
+ * | | | | | |
+ * | 2 | 3 | | 1 | 3 |
+ * | | | | | |
+ * +-------+-------+ +-------+-------+
+ *
+ * Optimization techniques used to accelerate the inverse DCT calculation:
+ *
+ * In a DCT coefficient block, the coefficients are increasingly likely to be 0
+ * moving diagonally from top left to bottom right. If whole rows of
+ * coefficients are 0, the inverse DCT calculation can be simplified. In this
+ * NEON implementation, on the first pass of the inverse DCT, we test for three
+ * special cases before defaulting to a full 'regular' inverse DCT:
+ *
+ * i) AC and DC coefficients are all zero. (Only tested for the right 4x8
+ * half of the DCT coefficient block.) In this case the inverse DCT result
+ * is all zero. We do no work here, signalling that the 'sparse' case is
+ * required in the second pass.
+ * ii) AC coefficients (all but the top row) are zero. In this case, the value
+ * of the inverse DCT of the AC coefficients is just the DC coefficients.
+ * iii) Coefficients of rows 4, 5, 6 and 7 are all zero. In this case we opt to
+ * execute a 'sparse' simplified inverse DCT.
+ *
+ * In the second pass, only a single special case is tested: whether the the AC
+ * and DC coefficients were all zero in the right 4x8 block in the first pass
+ * (case 'i'). If this is the case, a 'sparse' variant of the second pass
+ * inverse DCT is executed for both the left and right halves of the DCT block.
+ * (The transposition after the first pass would have made the bottom half of
+ * the block all zero.)
+ */
+
+void jsimd_idct_islow_neon(void *dct_table,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ ISLOW_MULT_TYPE *quantptr = dct_table;
+
+ int16_t workspace_l[8 * DCTSIZE / 2];
+ int16_t workspace_r[8 * DCTSIZE / 2];
+
+ /* Compute IDCT first pass on left 4x8 coefficient block. */
+ /* Load DCT coefficients in left 4x8 block. */
+ int16x4_t row0 = vld1_s16(coef_block + 0 * DCTSIZE);
+ int16x4_t row1 = vld1_s16(coef_block + 1 * DCTSIZE);
+ int16x4_t row2 = vld1_s16(coef_block + 2 * DCTSIZE);
+ int16x4_t row3 = vld1_s16(coef_block + 3 * DCTSIZE);
+ int16x4_t row4 = vld1_s16(coef_block + 4 * DCTSIZE);
+ int16x4_t row5 = vld1_s16(coef_block + 5 * DCTSIZE);
+ int16x4_t row6 = vld1_s16(coef_block + 6 * DCTSIZE);
+ int16x4_t row7 = vld1_s16(coef_block + 7 * DCTSIZE);
+
+ /* Load quantization table for left 4x8 block. */
+ int16x4_t quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE);
+ int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+ int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+ int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+ int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
+ int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+ int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+ int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+ /* Construct bitmap to test if DCT coefficients in left 4x8 block are 0. */
+ int16x4_t bitmap = vorr_s16(row7, row6);
+ bitmap = vorr_s16(bitmap, row5);
+ bitmap = vorr_s16(bitmap, row4);
+ int64_t bitmap_rows_4567 = vreinterpret_s64_s16(bitmap);
+
+ if (bitmap_rows_4567 == 0) {
+ bitmap = vorr_s16(bitmap, row3);
+ bitmap = vorr_s16(bitmap, row2);
+ bitmap = vorr_s16(bitmap, row1);
+ int64_t left_ac_bitmap = vreinterpret_s64_s16(bitmap);
+
+ if (left_ac_bitmap == 0) {
+ int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
+ int16x4x4_t quadrant = { dcval, dcval, dcval, dcval };
+ /* Store 4x4 blocks to workspace, transposing in the process. */
+ vst4_s16(workspace_l, quadrant);
+ vst4_s16(workspace_r, quadrant);
+ } else {
+ jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
+ quant_row1, quant_row2, quant_row3,
+ workspace_l, workspace_r);
+ }
+ } else {
+ jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
+ row6, row7, quant_row0, quant_row1,
+ quant_row2, quant_row3, quant_row4,
+ quant_row5, quant_row6, quant_row7,
+ workspace_l, workspace_r);
+ }
+
+ /* Compute IDCT first pass on right 4x8 coefficient block.*/
+ /* Load DCT coefficients for right 4x8 block. */
+ row0 = vld1_s16(coef_block + 0 * DCTSIZE + 4);
+ row1 = vld1_s16(coef_block + 1 * DCTSIZE + 4);
+ row2 = vld1_s16(coef_block + 2 * DCTSIZE + 4);
+ row3 = vld1_s16(coef_block + 3 * DCTSIZE + 4);
+ row4 = vld1_s16(coef_block + 4 * DCTSIZE + 4);
+ row5 = vld1_s16(coef_block + 5 * DCTSIZE + 4);
+ row6 = vld1_s16(coef_block + 6 * DCTSIZE + 4);
+ row7 = vld1_s16(coef_block + 7 * DCTSIZE + 4);
+
+ /* Load quantization table for right 4x8 block. */
+ quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE + 4);
+ quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+ quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+ quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+ quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
+ quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+ quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+ quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+ /* Construct bitmap to test if DCT coefficients in right 4x8 block are 0. */
+ bitmap = vorr_s16(row7, row6);
+ bitmap = vorr_s16(bitmap, row5);
+ bitmap = vorr_s16(bitmap, row4);
+ bitmap_rows_4567 = vreinterpret_s64_s16(bitmap);
+ bitmap = vorr_s16(bitmap, row3);
+ bitmap = vorr_s16(bitmap, row2);
+ bitmap = vorr_s16(bitmap, row1);
+ int64_t right_ac_bitmap = vreinterpret_s64_s16(bitmap);
+
+ /* Initialise to non-zero value: defaults to regular second pass. */
+ int64_t right_ac_dc_bitmap = 1;
+
+ if (right_ac_bitmap == 0) {
+ bitmap = vorr_s16(bitmap, row0);
+ right_ac_dc_bitmap = vreinterpret_s64_s16(bitmap);
+
+ if (right_ac_dc_bitmap != 0) {
+ int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
+ int16x4x4_t quadrant = { dcval, dcval, dcval, dcval };
+ /* Store 4x4 blocks to workspace, transposing in the process. */
+ vst4_s16(workspace_l + 4 * DCTSIZE / 2, quadrant);
+ vst4_s16(workspace_r + 4 * DCTSIZE / 2, quadrant);
+ }
+ } else {
+ if (bitmap_rows_4567 == 0) {
+ jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
+ quant_row1, quant_row2, quant_row3,
+ workspace_l + 4 * DCTSIZE / 2,
+ workspace_r + 4 * DCTSIZE / 2);
+ } else {
+ jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
+ row6, row7, quant_row0, quant_row1,
+ quant_row2, quant_row3, quant_row4,
+ quant_row5, quant_row6, quant_row7,
+ workspace_l + 4 * DCTSIZE / 2,
+ workspace_r + 4 * DCTSIZE / 2);
+ }
+ }
+
+ /* Second pass: compute IDCT on rows in workspace. */
+ /* If all coefficients in right 4x8 block are 0, use 'sparse' second pass. */
+ if (right_ac_dc_bitmap == 0) {
+ jsimd_idct_islow_pass2_sparse(workspace_l, output_buf, output_col, 0);
+ jsimd_idct_islow_pass2_sparse(workspace_r, output_buf, output_col, 4);
+ } else {
+ jsimd_idct_islow_pass2_regular(workspace_l, output_buf, output_col, 0);
+ jsimd_idct_islow_pass2_regular(workspace_r, output_buf, output_col, 4);
+ }
+}
+
+
+/* Performs dequantization and the first pass of the slow-but-accurate inverse
+ * DCT on a 4x8 block of coefficients. (To process the full 8x8 DCT block this
+ * function - or some other optimized variant - needs to be called on both the
+ * right and left 4x8 blocks.)
+ *
+ * This 'regular' version assumes that no optimization can be made to the IDCT
+ * calculation since no useful set of AC coefficients are all 0.
+ *
+ * The original C implementation of the slow IDCT 'jpeg_idct_slow' can be found
+ * in jidctint.c. Algorithmic changes made here are documented inline.
+ */
+
+static inline void jsimd_idct_islow_pass1_regular(int16x4_t row0,
+ int16x4_t row1,
+ int16x4_t row2,
+ int16x4_t row3,
+ int16x4_t row4,
+ int16x4_t row5,
+ int16x4_t row6,
+ int16x4_t row7,
+ int16x4_t quant_row0,
+ int16x4_t quant_row1,
+ int16x4_t quant_row2,
+ int16x4_t quant_row3,
+ int16x4_t quant_row4,
+ int16x4_t quant_row5,
+ int16x4_t quant_row6,
+ int16x4_t quant_row7,
+ int16_t *workspace_1,
+ int16_t *workspace_2)
+{
+ /* Load constants for IDCT calculation. */
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+
+ /* Even part. */
+ int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
+ int16x4_t z3_s16 = vmul_s16(row6, quant_row6);
+
+ int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+ int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+ tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
+ tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
+
+ z2_s16 = vmul_s16(row0, quant_row0);
+ z3_s16 = vmul_s16(row4, quant_row4);
+
+ int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
+ int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+ int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+ int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+ /* Odd part. */
+ int16x4_t tmp0_s16 = vmul_s16(row7, quant_row7);
+ int16x4_t tmp1_s16 = vmul_s16(row5, quant_row5);
+ int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
+ int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
+
+ z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
+ int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
+
+ /* Implementation as per 'jpeg_idct_islow' in jidctint.c:
+ * z5 = (z3 + z4) * 1.175875602;
+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ * z3 += z5; z4 += z5;
+ *
+ * This implementation:
+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+ */
+
+ int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+ int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+ z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+ z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+ /* Implementation as per 'jpeg_idct_islow' in jidctint.c:
+ * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ * tmp0 += z1 + z3; tmp1 += z2 + z4;
+ * tmp2 += z2 + z3; tmp3 += z1 + z4;
+ *
+ * This implementation:
+ * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ * tmp0 += z3; tmp1 += z4;
+ * tmp2 += z3; tmp3 += z4;
+ */
+
+ tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
+ tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
+ tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
+ tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
+
+ tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
+ tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
+ tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
+ tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
+
+ tmp0 = vaddq_s32(tmp0, z3);
+ tmp1 = vaddq_s32(tmp1, z4);
+ tmp2 = vaddq_s32(tmp2, z3);
+ tmp3 = vaddq_s32(tmp3, z4);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ int16x4x4_t rows_0123 = { vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
+ };
+ int16x4x4_t rows_4567 = { vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
+ };
+
+ /* Store 4x4 blocks to the intermediate workspace ready for second pass. */
+ /* (VST4 transposes the blocks - we need to operate on rows in next pass.) */
+ vst4_s16(workspace_1, rows_0123);
+ vst4_s16(workspace_2, rows_4567);
+}
+
+
+/* Performs dequantization and the first pass of the slow-but-accurate inverse
+ * DCT on a 4x8 block of coefficients.
+ *
+ * This 'sparse' version assumes that the AC coefficients in rows 4, 5, 6 and 7
+ * are all 0. This simplifies the IDCT calculation, accelerating overall
+ * performance.
+ */
+
+static inline void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
+ int16x4_t row1,
+ int16x4_t row2,
+ int16x4_t row3,
+ int16x4_t quant_row0,
+ int16x4_t quant_row1,
+ int16x4_t quant_row2,
+ int16x4_t quant_row3,
+ int16_t *workspace_1,
+ int16_t *workspace_2)
+{
+ /* Load constants for IDCT computation. */
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+
+ /* Even part. */
+ int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
+ /* z3 is all 0. */
+
+ int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+ int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+
+ z2_s16 = vmul_s16(row0, quant_row0);
+ int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
+ int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+ int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+ int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+ /* Odd part. */
+ /* tmp0 and tmp1 are both all 0. */
+ int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
+ int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
+
+ int16x4_t z3_s16 = tmp2_s16;
+ int16x4_t z4_s16 = tmp3_s16;
+
+ int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+ int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+ z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+ z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+ tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
+ tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
+ tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
+ tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ int16x4x4_t rows_0123 = { vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
+ };
+ int16x4x4_t rows_4567 = { vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
+ };
+
+ /* Store 4x4 blocks to the intermediate workspace ready for second pass. */
+ /* (VST4 transposes the blocks - we need to operate on rows in next pass.) */
+ vst4_s16(workspace_1, rows_0123);
+ vst4_s16(workspace_2, rows_4567);
+}
+
+
+/* Performs the second pass of the slow-but-accurate inverse DCT on a 4x8 block
+ * of coefficients. (To process the full 8x8 DCT block this function - or some
+ * other optimized variant - needs to be called on both the right and left 4x8
+ * blocks.)
+ *
+ * This 'regular' version assumes that no optimization can be made to the IDCT
+ * calculation since no useful set of coefficient values are all 0 after the
+ * first pass.
+ *
+ * Again, the original C implementation of the slow IDCT 'jpeg_idct_slow' can
+ * be found in jidctint.c. Algorithmic changes made here are documented inline.
+ */
+
+static inline void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col,
+ unsigned buf_offset)
+{
+ /* Load constants for IDCT computation. */
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+
+ /* Even part. */
+ int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
+ int16x4_t z3_s16 = vld1_s16(workspace + 6 * DCTSIZE / 2);
+
+ int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+ int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+ tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
+ tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
+
+ z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
+ z3_s16 = vld1_s16(workspace + 4 * DCTSIZE / 2);
+
+ int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
+ int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+ int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+ int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+ /* Odd part. */
+ int16x4_t tmp0_s16 = vld1_s16(workspace + 7 * DCTSIZE / 2);
+ int16x4_t tmp1_s16 = vld1_s16(workspace + 5 * DCTSIZE / 2);
+ int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
+ int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
+
+ z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
+ int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
+
+ /* Implementation as per 'jpeg_idct_islow' in jidctint.c:
+ * z5 = (z3 + z4) * 1.175875602;
+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ * z3 += z5; z4 += z5;
+ *
+ * This implementation:
+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+ */
+
+ int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+ int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+ z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+ z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+ /* Implementation as per 'jpeg_idct_islow' in jidctint.c:
+ * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ * tmp0 += z1 + z3; tmp1 += z2 + z4;
+ * tmp2 += z2 + z3; tmp3 += z1 + z4;
+ *
+ * This implementation:
+ * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ * tmp0 += z3; tmp1 += z4;
+ * tmp2 += z3; tmp3 += z4;
+ */
+
+ tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
+ tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
+ tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
+ tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
+
+ tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
+ tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
+ tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
+ tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
+
+ tmp0 = vaddq_s32(tmp0, z3);
+ tmp1 = vaddq_s32(tmp1, z4);
+ tmp2 = vaddq_s32(tmp2, z3);
+ tmp3 = vaddq_s32(tmp3, z4);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
+ vaddhn_s32(tmp12, tmp1));
+ int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
+ vaddhn_s32(tmp13, tmp0));
+ int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
+ vsubhn_s32(tmp11, tmp2));
+ int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
+ vsubhn_s32(tmp10, tmp3));
+ /* Descale and narrow to 8-bit. */
+ int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
+ int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
+ int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
+ int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
+ /* Clamp to range [0-255]. */
+ uint8x8_t cols_02_u8 = vadd_u8(cols_02_s8, vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_13_u8 = vadd_u8(cols_13_s8, vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_46_u8 = vadd_u8(cols_46_s8, vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_57_u8 = vadd_u8(cols_57_s8, vdup_n_u8(CENTERJSAMPLE));
+
+ /* Transpose 4x8 block and store to memory. */
+ /* Zipping adjacent columns together allows us to store 16-bit elements. */
+ uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
+ uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
+ uint16x4x4_t cols_01_23_45_67 = { vreinterpret_u16_u8(cols_01_23.val[0]),
+ vreinterpret_u16_u8(cols_01_23.val[1]),
+ vreinterpret_u16_u8(cols_45_67.val[0]),
+ vreinterpret_u16_u8(cols_45_67.val[1])
+ };
+ /* VST4 of 16-bit elements completes the transpose. */
+ vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 0] + output_col),
+ cols_01_23_45_67, 0);
+ vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 1] + output_col),
+ cols_01_23_45_67, 1);
+ vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 2] + output_col),
+ cols_01_23_45_67, 2);
+ vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 3] + output_col),
+ cols_01_23_45_67, 3);
+}
+
+
+/* Performs the second pass of the slow-but-accurate inverse DCT on a 4x8 block
+ * of coefficients.
+ *
+ * This 'sparse' version assumes that the coefficient values (after the first
+ * pass) in rows 4, 5, 6 and 7 are all 0. This simplifies the IDCT calculation,
+ * accelerating overall performance.
+ */
+
+static inline void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col,
+ unsigned buf_offset)
+{
+ /* Load constants for IDCT computation. */
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+
+ /* Even part. */
+ int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
+ /* z3 is all 0. */
+
+ int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+ int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+
+ z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
+ int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
+ int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+ int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+ int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+ /* Odd part. */
+ /* tmp0 and tmp1 are both all 0. */
+ int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
+ int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
+
+ int16x4_t z3_s16 = tmp2_s16;
+ int16x4_t z4_s16 = tmp3_s16;
+
+ int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+ z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+ int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+ z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+ tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
+ tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
+ tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
+ tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
+ vaddhn_s32(tmp12, tmp1));
+ int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
+ vaddhn_s32(tmp13, tmp0));
+ int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
+ vsubhn_s32(tmp11, tmp2));
+ int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
+ vsubhn_s32(tmp10, tmp3));
+ /* Descale and narrow to 8-bit. */
+ int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
+ int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
+ int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
+ int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
+ /* Clamp to range [0-255]. */
+ uint8x8_t cols_02_u8 = vadd_u8(cols_02_s8, vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_13_u8 = vadd_u8(cols_13_s8, vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_46_u8 = vadd_u8(cols_46_s8, vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_57_u8 = vadd_u8(cols_57_s8, vdup_n_u8(CENTERJSAMPLE));
+
+ /* Transpose 4x8 block and store to memory. */
+ /* Zipping adjacent columns together allow us to store 16-bit elements. */
+ uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
+ uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
+ uint16x4x4_t cols_01_23_45_67 = { vreinterpret_u16_u8(cols_01_23.val[0]),
+ vreinterpret_u16_u8(cols_01_23.val[1]),
+ vreinterpret_u16_u8(cols_45_67.val[0]),
+ vreinterpret_u16_u8(cols_45_67.val[1])
+ };
+ /* VST4 of 16-bit elements completes the transpose. */
+ vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 0] + output_col),
+ cols_01_23_45_67, 0);
+ vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 1] + output_col),
+ cols_01_23_45_67, 1);
+ vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 2] + output_col),
+ cols_01_23_45_67, 2);
+ vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 3] + output_col),
+ cols_01_23_45_67, 3);
+}