aboutsummaryrefslogtreecommitdiff
path: root/simd/arm/arm64/jsimd_neon.S
diff options
context:
space:
mode:
authorJonathan Wright <jonathan.wright@arm.com>2019-06-18 16:07:52 +0100
committerJonathan Wright <jonathan.wright@arm.com>2019-10-19 22:46:19 +0000
commitbc13578529255ec75005ffc98aae151666122892 (patch)
tree9b350ae4c29829d4bcc67a21723d044a47ca4bfb /simd/arm/arm64/jsimd_neon.S
parent9d3bf3e9680156c48041c8b90fece504e3539a61 (diff)
downloadlibjpeg-turbo-bc13578529255ec75005ffc98aae151666122892.tar.gz
Implement slow IDCT using Arm NEON intrinsics
Adds an Arm NEON intrinsics implementation of the 'slow' accurate inverse discrete cosine transform (IDCT). The NEON assembly implementations are removed for both AArch32 and AArch64. Bug: 922430 Change-Id: Ic35eb8f0e5dee0a8a1f71b9ffc303be5e2d3e392
Diffstat (limited to 'simd/arm/arm64/jsimd_neon.S')
-rw-r--r--simd/arm/arm64/jsimd_neon.S642
1 files changed, 0 insertions, 642 deletions
diff --git a/simd/arm/arm64/jsimd_neon.S b/simd/arm/arm64/jsimd_neon.S
index 5077a168..4d387416 100644
--- a/simd/arm/arm64/jsimd_neon.S
+++ b/simd/arm/arm64/jsimd_neon.S
@@ -90,648 +90,6 @@ _\fname:
/*****************************************************************************/
/*
- * Perform dequantization and inverse DCT on one block of coefficients.
- *
- * GLOBAL(void)
- * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
- * JSAMPARRAY output_buf, JDIMENSION output_col)
- */
-
-#define CONST_BITS 13
-#define PASS1_BITS 2
-
-#define F_0_298 2446 /* FIX(0.298631336) */
-#define F_0_390 3196 /* FIX(0.390180644) */
-#define F_0_541 4433 /* FIX(0.541196100) */
-#define F_0_765 6270 /* FIX(0.765366865) */
-#define F_0_899 7373 /* FIX(0.899976223) */
-#define F_1_175 9633 /* FIX(1.175875602) */
-#define F_1_501 12299 /* FIX(1.501321110) */
-#define F_1_847 15137 /* FIX(1.847759065) */
-#define F_1_961 16069 /* FIX(1.961570560) */
-#define F_2_053 16819 /* FIX(2.053119869) */
-#define F_2_562 20995 /* FIX(2.562915447) */
-#define F_3_072 25172 /* FIX(3.072711026) */
-
-.balign 16
-Ljsimd_idct_islow_neon_consts:
- .short F_0_298
- .short -F_0_390
- .short F_0_541
- .short F_0_765
- .short - F_0_899
- .short F_1_175
- .short F_1_501
- .short - F_1_847
- .short - F_1_961
- .short F_2_053
- .short - F_2_562
- .short F_3_072
- .short 0 /* padding */
- .short 0
- .short 0
- .short 0
-
-#undef F_0_298
-#undef F_0_390
-#undef F_0_541
-#undef F_0_765
-#undef F_0_899
-#undef F_1_175
-#undef F_1_501
-#undef F_1_847
-#undef F_1_961
-#undef F_2_053
-#undef F_2_562
-#undef F_3_072
-
-#define XFIX_P_0_298 v0.h[0]
-#define XFIX_N_0_390 v0.h[1]
-#define XFIX_P_0_541 v0.h[2]
-#define XFIX_P_0_765 v0.h[3]
-#define XFIX_N_0_899 v0.h[4]
-#define XFIX_P_1_175 v0.h[5]
-#define XFIX_P_1_501 v0.h[6]
-#define XFIX_N_1_847 v0.h[7]
-#define XFIX_N_1_961 v1.h[0]
-#define XFIX_P_2_053 v1.h[1]
-#define XFIX_N_2_562 v1.h[2]
-#define XFIX_P_3_072 v1.h[3]
-
-asm_function jsimd_idct_islow_neon
- DCT_TABLE .req x0
- COEF_BLOCK .req x1
- OUTPUT_BUF .req x2
- OUTPUT_COL .req x3
- TMP1 .req x0
- TMP2 .req x1
- TMP3 .req x9
- TMP4 .req x10
- TMP5 .req x11
- TMP6 .req x12
- TMP7 .req x13
- TMP8 .req x14
-
- /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
- guarantee that the upper (unused) 32 bits of x3 are valid. This
- instruction ensures that those bits are set to zero. */
- uxtw x3, w3
-
- sub sp, sp, #64
- adr x15, Ljsimd_idct_islow_neon_consts
- mov x10, sp
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
- ld1 {v0.8h, v1.8h}, [x15]
- ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
- ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
- ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
- ld1 {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64
-
- cmeq v16.8h, v3.8h, #0
- cmeq v26.8h, v4.8h, #0
- cmeq v27.8h, v5.8h, #0
- cmeq v28.8h, v6.8h, #0
- cmeq v29.8h, v7.8h, #0
- cmeq v30.8h, v8.8h, #0
- cmeq v31.8h, v9.8h, #0
-
- and v10.16b, v16.16b, v26.16b
- and v11.16b, v27.16b, v28.16b
- and v12.16b, v29.16b, v30.16b
- and v13.16b, v31.16b, v10.16b
- and v14.16b, v11.16b, v12.16b
- mul v2.8h, v2.8h, v18.8h
- and v15.16b, v13.16b, v14.16b
- shl v10.8h, v2.8h, #(PASS1_BITS)
- sqxtn v16.8b, v15.8h
- mov TMP1, v16.d[0]
- mvn TMP2, TMP1
-
- cbnz TMP2, 2f
- /* case all AC coeffs are zeros */
- dup v2.2d, v10.d[0]
- dup v6.2d, v10.d[1]
- mov v3.16b, v2.16b
- mov v7.16b, v6.16b
- mov v4.16b, v2.16b
- mov v8.16b, v6.16b
- mov v5.16b, v2.16b
- mov v9.16b, v6.16b
-1:
- /* for this transpose, we should organise data like this:
- * 00, 01, 02, 03, 40, 41, 42, 43
- * 10, 11, 12, 13, 50, 51, 52, 53
- * 20, 21, 22, 23, 60, 61, 62, 63
- * 30, 31, 32, 33, 70, 71, 72, 73
- * 04, 05, 06, 07, 44, 45, 46, 47
- * 14, 15, 16, 17, 54, 55, 56, 57
- * 24, 25, 26, 27, 64, 65, 66, 67
- * 34, 35, 36, 37, 74, 75, 76, 77
- */
- trn1 v28.8h, v2.8h, v3.8h
- trn1 v29.8h, v4.8h, v5.8h
- trn1 v30.8h, v6.8h, v7.8h
- trn1 v31.8h, v8.8h, v9.8h
- trn2 v16.8h, v2.8h, v3.8h
- trn2 v17.8h, v4.8h, v5.8h
- trn2 v18.8h, v6.8h, v7.8h
- trn2 v19.8h, v8.8h, v9.8h
- trn1 v2.4s, v28.4s, v29.4s
- trn1 v6.4s, v30.4s, v31.4s
- trn1 v3.4s, v16.4s, v17.4s
- trn1 v7.4s, v18.4s, v19.4s
- trn2 v4.4s, v28.4s, v29.4s
- trn2 v8.4s, v30.4s, v31.4s
- trn2 v5.4s, v16.4s, v17.4s
- trn2 v9.4s, v18.4s, v19.4s
- /* Even part: reverse the even part of the forward DCT. */
- add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
- add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
- smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
- sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
- smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
- sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
- mov v21.16b, v19.16b /* tmp3 = z1 */
- mov v20.16b, v18.16b /* tmp3 = z1 */
- smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
- smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
- sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
- smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
- smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
- sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
- sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
- add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
- sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
- add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
- sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
- add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
- sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
- add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
- sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
-
- /* Odd part per figure 8; the matrix is unitary and hence its
- * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
- */
-
- add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
- add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
- add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
- add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
- add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
-
- smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
- smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
- smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
- smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
- smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
- smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
- smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
- smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
-
- smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
- smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
- smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
- smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
- smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
- smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
- smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
- smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
-
- add v23.4s, v23.4s, v27.4s /* z3 += z5 */
- add v22.4s, v22.4s, v26.4s /* z3 += z5 */
- add v25.4s, v25.4s, v27.4s /* z4 += z5 */
- add v24.4s, v24.4s, v26.4s /* z4 += z5 */
-
- add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
- add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
- add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
- add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
- add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
- add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
- add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
- add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
-
- add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
- add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
- add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
- add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
- add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
- add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
- add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
- add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
-
- /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
-
- add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
- add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
- sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
- sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
- add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
- add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
- sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
- sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
- add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
- add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
- sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
- sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
- add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
- add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
- sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
- sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
-
- shrn v2.4h, v18.4s, #16 /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
- shrn v9.4h, v20.4s, #16 /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
- shrn v3.4h, v22.4s, #16 /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
- shrn v8.4h, v24.4s, #16 /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
- shrn v4.4h, v26.4s, #16 /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
- shrn v7.4h, v28.4s, #16 /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
- shrn v5.4h, v14.4s, #16 /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
- shrn v6.4h, v16.4s, #16 /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
- shrn2 v2.8h, v19.4s, #16 /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
- shrn2 v9.8h, v21.4s, #16 /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
- shrn2 v3.8h, v23.4s, #16 /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
- shrn2 v8.8h, v25.4s, #16 /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
- shrn2 v4.8h, v27.4s, #16 /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
- shrn2 v7.8h, v29.4s, #16 /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
- shrn2 v5.8h, v15.4s, #16 /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
- shrn2 v6.8h, v17.4s, #16 /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
- movi v0.16b, #(CENTERJSAMPLE)
- /* Prepare pointers (dual-issue with NEON instructions) */
- ldp TMP1, TMP2, [OUTPUT_BUF], 16
- sqrshrn v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16)
- ldp TMP3, TMP4, [OUTPUT_BUF], 16
- sqrshrn v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16)
- add TMP1, TMP1, OUTPUT_COL
- sqrshrn v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16)
- add TMP2, TMP2, OUTPUT_COL
- sqrshrn v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16)
- add TMP3, TMP3, OUTPUT_COL
- sqrshrn2 v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16)
- add TMP4, TMP4, OUTPUT_COL
- sqrshrn2 v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16)
- ldp TMP5, TMP6, [OUTPUT_BUF], 16
- sqrshrn2 v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16)
- ldp TMP7, TMP8, [OUTPUT_BUF], 16
- sqrshrn2 v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16)
- add TMP5, TMP5, OUTPUT_COL
- add v16.16b, v28.16b, v0.16b
- add TMP6, TMP6, OUTPUT_COL
- add v18.16b, v29.16b, v0.16b
- add TMP7, TMP7, OUTPUT_COL
- add v20.16b, v30.16b, v0.16b
- add TMP8, TMP8, OUTPUT_COL
- add v22.16b, v31.16b, v0.16b
-
- /* Transpose the final 8-bit samples */
- trn1 v28.16b, v16.16b, v18.16b
- trn1 v30.16b, v20.16b, v22.16b
- trn2 v29.16b, v16.16b, v18.16b
- trn2 v31.16b, v20.16b, v22.16b
-
- trn1 v16.8h, v28.8h, v30.8h
- trn2 v18.8h, v28.8h, v30.8h
- trn1 v20.8h, v29.8h, v31.8h
- trn2 v22.8h, v29.8h, v31.8h
-
- uzp1 v28.4s, v16.4s, v18.4s
- uzp2 v30.4s, v16.4s, v18.4s
- uzp1 v29.4s, v20.4s, v22.4s
- uzp2 v31.4s, v20.4s, v22.4s
-
- /* Store results to the output buffer */
- st1 {v28.d}[0], [TMP1]
- st1 {v29.d}[0], [TMP2]
- st1 {v28.d}[1], [TMP3]
- st1 {v29.d}[1], [TMP4]
- st1 {v30.d}[0], [TMP5]
- st1 {v31.d}[0], [TMP6]
- st1 {v30.d}[1], [TMP7]
- st1 {v31.d}[1], [TMP8]
- ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
- ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
- blr x30
-
-.balign 16
-2:
- mul v3.8h, v3.8h, v19.8h
- mul v4.8h, v4.8h, v20.8h
- mul v5.8h, v5.8h, v21.8h
- add TMP4, xzr, TMP2, LSL #32
- mul v6.8h, v6.8h, v22.8h
- mul v7.8h, v7.8h, v23.8h
- adds TMP3, xzr, TMP2, LSR #32
- mul v8.8h, v8.8h, v24.8h
- mul v9.8h, v9.8h, v25.8h
- b.ne 3f
- /* Right AC coef is zero */
- dup v15.2d, v10.d[1]
- /* Even part: reverse the even part of the forward DCT. */
- add v18.4h, v4.4h, v8.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
- add v22.4h, v2.4h, v6.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
- sub v26.4h, v2.4h, v6.4h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
- smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
- sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
- mov v20.16b, v18.16b /* tmp3 = z1 */
- sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
- smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
- smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
- add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
- sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
- add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
- sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
-
- /* Odd part per figure 8; the matrix is unitary and hence its
- * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
- */
-
- add v22.4h, v9.4h, v5.4h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
- add v24.4h, v7.4h, v3.4h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
- add v18.4h, v9.4h, v3.4h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
- add v20.4h, v7.4h, v5.4h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
- add v26.4h, v22.4h, v24.4h /* z5 = z3 + z4 */
-
- smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
- smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
- smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
- smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
- smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
- smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
- smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
- smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
-
- add v22.4s, v22.4s, v26.4s /* z3 += z5 */
- add v24.4s, v24.4s, v26.4s /* z4 += z5 */
-
- add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
- add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
- add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
- add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
-
- add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
- add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
- add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
- add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
-
- /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
-
- add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
- sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
- add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
- sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
- add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
- sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
- add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
- sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
-
- rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
- rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
- rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
- rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
- rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
- rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
- mov v6.16b, v15.16b
- mov v7.16b, v15.16b
- mov v8.16b, v15.16b
- mov v9.16b, v15.16b
- b 1b
-
-.balign 16
-3:
- cbnz TMP4, 4f
- /* Left AC coef is zero */
- dup v14.2d, v10.d[0]
- /* Even part: reverse the even part of the forward DCT. */
- add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
- add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
- smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
- sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
- sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
- mov v21.16b, v19.16b /* tmp3 = z1 */
- smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
- sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
- smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
- add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
- sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
- add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
- sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
-
- /* Odd part per figure 8; the matrix is unitary and hence its
- * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
- */
-
- add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
- add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
- add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
- add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
- add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
-
- smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
- smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
- smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
- smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
- smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
- smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
- smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
- smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
-
- add v23.4s, v23.4s, v27.4s /* z3 += z5 */
- add v22.4s, v22.4s, v26.4s /* z3 += z5 */
- add v25.4s, v25.4s, v27.4s /* z4 += z5 */
- add v24.4s, v24.4s, v26.4s /* z4 += z5 */
-
- add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
- add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
- add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
- add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
-
- add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
- add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
- add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
- add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
-
- /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
-
- add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
- sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
- add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
- sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
- add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
- sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
- add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
- sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
-
- mov v2.16b, v14.16b
- mov v3.16b, v14.16b
- mov v4.16b, v14.16b
- mov v5.16b, v14.16b
- rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
- rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
- rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
- rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
- rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
- rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
- b 1b
-
-.balign 16
-4:
- /* "No" AC coef is zero */
- /* Even part: reverse the even part of the forward DCT. */
- add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
- add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
- smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
- sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
- smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
- sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
- mov v21.16b, v19.16b /* tmp3 = z1 */
- mov v20.16b, v18.16b /* tmp3 = z1 */
- smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
- smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
- sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
- smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
- smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
- sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
- sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
- add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
- sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
- add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
- sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
- add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
- sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
- add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
- sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
-
- /* Odd part per figure 8; the matrix is unitary and hence its
- * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
- */
-
- add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
- add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
- add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
- add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
- add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
-
- smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
- smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
- smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
- smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
- smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
- smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
- smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
- smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
-
- smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
- smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
- smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
- smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
- smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
- smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
- smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
- smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
-
- add v23.4s, v23.4s, v27.4s /* z3 += z5 */
- add v22.4s, v22.4s, v26.4s /* z3 += z5 */
- add v25.4s, v25.4s, v27.4s /* z4 += z5 */
- add v24.4s, v24.4s, v26.4s /* z4 += z5 */
-
- add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
- add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
- add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
- add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
- add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
- add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
- add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
- add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
-
- add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
- add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
- add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
- add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
- add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
- add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
- add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
- add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
-
- /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
-
- add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
- add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
- sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
- sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
- add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
- add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
- sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
- sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
- add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
- add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
- sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
- sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
- add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
- add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
- sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
- sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
-
- rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
- rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
- rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
- rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
- rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
- rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
- rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
- rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
- rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
- rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
- rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
- rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
- rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
- b 1b
-
- .unreq DCT_TABLE
- .unreq COEF_BLOCK
- .unreq OUTPUT_BUF
- .unreq OUTPUT_COL
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
- .unreq TMP5
- .unreq TMP6
- .unreq TMP7
- .unreq TMP8
-
-#undef CENTERJSAMPLE
-#undef CONST_BITS
-#undef PASS1_BITS
-#undef XFIX_P_0_298
-#undef XFIX_N_0_390
-#undef XFIX_P_0_541
-#undef XFIX_P_0_765
-#undef XFIX_N_0_899
-#undef XFIX_P_1_175
-#undef XFIX_P_1_501
-#undef XFIX_N_1_847
-#undef XFIX_N_1_961
-#undef XFIX_P_2_053
-#undef XFIX_N_2_562
-#undef XFIX_P_3_072
-
-
-/*****************************************************************************/
-
-/*
* jsimd_idct_ifast_neon
*
* This function contains a fast, not so accurate integer implementation of