Implement slow IDCT using Arm NEON intrinsics

Adds an Arm NEON intrinsics implementation of the 'slow' accurate inverse discrete cosine transform (IDCT). The NEON assembly implementations are removed for both AArch32 and AArch64.

Bug: 922430
Change-Id: Ic35eb8f0e5dee0a8a1f71b9ffc303be5e2d3e392
author: Jonathan Wright <jonathan.wright@arm.com> 2019-06-18 16:07:52 +0100
committer: Jonathan Wright <jonathan.wright@arm.com> 2019-10-19 22:46:19 +0000
commit: bc13578529255ec75005ffc98aae151666122892 (patch)
tree: 9b350ae4c29829d4bcc67a21723d044a47ca4bfb /simd/arm/arm64/jsimd_neon.S
parent: 9d3bf3e9680156c48041c8b90fece504e3539a61 (diff)
download: libjpeg-turbo-bc13578529255ec75005ffc98aae151666122892.tar.gz
1 file changed, 0 insertions, 642 deletions
diff --git a/simd/arm/arm64/jsimd_neon.S b/simd/arm/arm64/jsimd_neon.S
index 5077a168..4d387416 100644
--- a/simd/arm/arm64/jsimd_neon.S
+++ b/simd/arm/arm64/jsimd_neon.S
@@ -90,648 +90,6 @@ _\fname:
 /*****************************************************************************/
 
 /*
- * Perform dequantization and inverse DCT on one block of coefficients.
- *
- * GLOBAL(void)
- * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
- *                       JSAMPARRAY output_buf, JDIMENSION output_col)
- */
-
-#define CONST_BITS  13
-#define PASS1_BITS  2
-
-#define F_0_298   2446  /* FIX(0.298631336) */
-#define F_0_390   3196  /* FIX(0.390180644) */
-#define F_0_541   4433  /* FIX(0.541196100) */
-#define F_0_765   6270  /* FIX(0.765366865) */
-#define F_0_899   7373  /* FIX(0.899976223) */
-#define F_1_175   9633  /* FIX(1.175875602) */
-#define F_1_501  12299  /* FIX(1.501321110) */
-#define F_1_847  15137  /* FIX(1.847759065) */
-#define F_1_961  16069  /* FIX(1.961570560) */
-#define F_2_053  16819  /* FIX(2.053119869) */
-#define F_2_562  20995  /* FIX(2.562915447) */
-#define F_3_072  25172  /* FIX(3.072711026) */
-
-.balign 16
-Ljsimd_idct_islow_neon_consts:
-  .short F_0_298
-  .short -F_0_390
-  .short F_0_541
-  .short F_0_765
-  .short - F_0_899
-  .short F_1_175
-  .short F_1_501
-  .short - F_1_847
-  .short - F_1_961
-  .short F_2_053
-  .short - F_2_562
-  .short F_3_072
-  .short 0          /* padding */
-  .short 0
-  .short 0
-  .short 0
-
-#undef F_0_298
-#undef F_0_390
-#undef F_0_541
-#undef F_0_765
-#undef F_0_899
-#undef F_1_175
-#undef F_1_501
-#undef F_1_847
-#undef F_1_961
-#undef F_2_053
-#undef F_2_562
-#undef F_3_072
-
-#define XFIX_P_0_298  v0.h[0]
-#define XFIX_N_0_390  v0.h[1]
-#define XFIX_P_0_541  v0.h[2]
-#define XFIX_P_0_765  v0.h[3]
-#define XFIX_N_0_899  v0.h[4]
-#define XFIX_P_1_175  v0.h[5]
-#define XFIX_P_1_501  v0.h[6]
-#define XFIX_N_1_847  v0.h[7]
-#define XFIX_N_1_961  v1.h[0]
-#define XFIX_P_2_053  v1.h[1]
-#define XFIX_N_2_562  v1.h[2]
-#define XFIX_P_3_072  v1.h[3]
-
-asm_function jsimd_idct_islow_neon
-    DCT_TABLE       .req x0
-    COEF_BLOCK      .req x1
-    OUTPUT_BUF      .req x2
-    OUTPUT_COL      .req x3
-    TMP1            .req x0
-    TMP2            .req x1
-    TMP3            .req x9
-    TMP4            .req x10
-    TMP5            .req x11
-    TMP6            .req x12
-    TMP7            .req x13
-    TMP8            .req x14
-
-    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
-       guarantee that the upper (unused) 32 bits of x3 are valid.  This
-       instruction ensures that those bits are set to zero. */
-    uxtw x3, w3
-
-    sub             sp, sp, #64
-    adr             x15, Ljsimd_idct_islow_neon_consts
-    mov             x10, sp
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
-    ld1             {v0.8h, v1.8h}, [x15]
-    ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
-    ld1             {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
-    ld1             {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
-    ld1             {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64
-
-    cmeq            v16.8h, v3.8h, #0
-    cmeq            v26.8h, v4.8h, #0
-    cmeq            v27.8h, v5.8h, #0
-    cmeq            v28.8h, v6.8h, #0
-    cmeq            v29.8h, v7.8h, #0
-    cmeq            v30.8h, v8.8h, #0
-    cmeq            v31.8h, v9.8h, #0
-
-    and             v10.16b, v16.16b, v26.16b
-    and             v11.16b, v27.16b, v28.16b
-    and             v12.16b, v29.16b, v30.16b
-    and             v13.16b, v31.16b, v10.16b
-    and             v14.16b, v11.16b, v12.16b
-    mul             v2.8h, v2.8h, v18.8h
-    and             v15.16b, v13.16b, v14.16b
-    shl             v10.8h, v2.8h, #(PASS1_BITS)
-    sqxtn           v16.8b, v15.8h
-    mov             TMP1, v16.d[0]
-    mvn             TMP2, TMP1
-
-    cbnz            TMP2, 2f
-    /* case all AC coeffs are zeros */
-    dup             v2.2d, v10.d[0]
-    dup             v6.2d, v10.d[1]
-    mov             v3.16b, v2.16b
-    mov             v7.16b, v6.16b
-    mov             v4.16b, v2.16b
-    mov             v8.16b, v6.16b
-    mov             v5.16b, v2.16b
-    mov             v9.16b, v6.16b
-1:
-    /* for this transpose, we should organise data like this:
-     * 00, 01, 02, 03, 40, 41, 42, 43
-     * 10, 11, 12, 13, 50, 51, 52, 53
-     * 20, 21, 22, 23, 60, 61, 62, 63
-     * 30, 31, 32, 33, 70, 71, 72, 73
-     * 04, 05, 06, 07, 44, 45, 46, 47
-     * 14, 15, 16, 17, 54, 55, 56, 57
-     * 24, 25, 26, 27, 64, 65, 66, 67
-     * 34, 35, 36, 37, 74, 75, 76, 77
-     */
-    trn1            v28.8h, v2.8h, v3.8h
-    trn1            v29.8h, v4.8h, v5.8h
-    trn1            v30.8h, v6.8h, v7.8h
-    trn1            v31.8h, v8.8h, v9.8h
-    trn2            v16.8h, v2.8h, v3.8h
-    trn2            v17.8h, v4.8h, v5.8h
-    trn2            v18.8h, v6.8h, v7.8h
-    trn2            v19.8h, v8.8h, v9.8h
-    trn1            v2.4s, v28.4s, v29.4s
-    trn1            v6.4s, v30.4s, v31.4s
-    trn1            v3.4s, v16.4s, v17.4s
-    trn1            v7.4s, v18.4s, v19.4s
-    trn2            v4.4s, v28.4s, v29.4s
-    trn2            v8.4s, v30.4s, v31.4s
-    trn2            v5.4s, v16.4s, v17.4s
-    trn2            v9.4s, v18.4s, v19.4s
-    /* Even part: reverse the even part of the forward DCT. */
-    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
-    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
-    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
-    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
-    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
-    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
-    mov             v21.16b, v19.16b               /* tmp3 = z1 */
-    mov             v20.16b, v18.16b               /* tmp3 = z1 */
-    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
-    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
-    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
-    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
-    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
-    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
-    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
-    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
-    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
-    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
-    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
-    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
-    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
-    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
-    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
-
-    /* Odd part per figure 8; the matrix is unitary and hence its
-     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
-     */
-
-    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
-    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
-    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
-    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
-    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
-
-    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
-    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
-    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
-    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
-    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
-    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
-    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
-    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
-    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
-
-    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
-    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
-    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
-    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
-    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
-    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
-    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
-    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
-    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
-
-    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
-    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
-    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
-    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
-
-    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
-    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
-    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
-    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
-    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
-    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
-    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
-    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
-
-    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
-    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
-    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
-    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
-    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
-    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
-    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
-    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
-
-    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
-
-    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
-    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
-    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
-    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
-    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
-    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
-    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
-    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
-    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
-    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
-    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
-    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
-    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
-    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
-    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
-    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
-
-    shrn            v2.4h, v18.4s, #16  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
-    shrn            v9.4h, v20.4s, #16  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
-    shrn            v3.4h, v22.4s, #16  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
-    shrn            v8.4h, v24.4s, #16  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
-    shrn            v4.4h, v26.4s, #16  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
-    shrn            v7.4h, v28.4s, #16  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
-    shrn            v5.4h, v14.4s, #16  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
-    shrn            v6.4h, v16.4s, #16  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
-    shrn2           v2.8h, v19.4s, #16  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
-    shrn2           v9.8h, v21.4s, #16  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
-    shrn2           v3.8h, v23.4s, #16  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
-    shrn2           v8.8h, v25.4s, #16  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
-    shrn2           v4.8h, v27.4s, #16  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
-    shrn2           v7.8h, v29.4s, #16  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
-    shrn2           v5.8h, v15.4s, #16  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
-    shrn2           v6.8h, v17.4s, #16  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
-    movi            v0.16b, #(CENTERJSAMPLE)
-    /* Prepare pointers (dual-issue with NEON instructions) */
-      ldp             TMP1, TMP2, [OUTPUT_BUF], 16
-    sqrshrn         v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16)
-      ldp             TMP3, TMP4, [OUTPUT_BUF], 16
-    sqrshrn         v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16)
-      add             TMP1, TMP1, OUTPUT_COL
-    sqrshrn         v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16)
-      add             TMP2, TMP2, OUTPUT_COL
-    sqrshrn         v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16)
-      add             TMP3, TMP3, OUTPUT_COL
-    sqrshrn2        v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16)
-      add             TMP4, TMP4, OUTPUT_COL
-    sqrshrn2        v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16)
-      ldp             TMP5, TMP6, [OUTPUT_BUF], 16
-    sqrshrn2        v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16)
-      ldp             TMP7, TMP8, [OUTPUT_BUF], 16
-    sqrshrn2        v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16)
-      add             TMP5, TMP5, OUTPUT_COL
-    add             v16.16b, v28.16b, v0.16b
-      add             TMP6, TMP6, OUTPUT_COL
-    add             v18.16b, v29.16b, v0.16b
-      add             TMP7, TMP7, OUTPUT_COL
-    add             v20.16b, v30.16b, v0.16b
-      add             TMP8, TMP8, OUTPUT_COL
-    add             v22.16b, v31.16b, v0.16b
-
-    /* Transpose the final 8-bit samples */
-    trn1            v28.16b, v16.16b, v18.16b
-    trn1            v30.16b, v20.16b, v22.16b
-    trn2            v29.16b, v16.16b, v18.16b
-    trn2            v31.16b, v20.16b, v22.16b
-
-    trn1            v16.8h, v28.8h, v30.8h
-    trn2            v18.8h, v28.8h, v30.8h
-    trn1            v20.8h, v29.8h, v31.8h
-    trn2            v22.8h, v29.8h, v31.8h
-
-    uzp1            v28.4s, v16.4s, v18.4s
-    uzp2            v30.4s, v16.4s, v18.4s
-    uzp1            v29.4s, v20.4s, v22.4s
-    uzp2            v31.4s, v20.4s, v22.4s
-
-    /* Store results to the output buffer */
-    st1             {v28.d}[0], [TMP1]
-    st1             {v29.d}[0], [TMP2]
-    st1             {v28.d}[1], [TMP3]
-    st1             {v29.d}[1], [TMP4]
-    st1             {v30.d}[0], [TMP5]
-    st1             {v31.d}[0], [TMP6]
-    st1             {v30.d}[1], [TMP7]
-    st1             {v31.d}[1], [TMP8]
-    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
-    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
-    blr             x30
-
-.balign 16
-2:
-    mul             v3.8h, v3.8h, v19.8h
-    mul             v4.8h, v4.8h, v20.8h
-    mul             v5.8h, v5.8h, v21.8h
-    add             TMP4, xzr, TMP2, LSL #32
-    mul             v6.8h, v6.8h, v22.8h
-    mul             v7.8h, v7.8h, v23.8h
-    adds            TMP3, xzr, TMP2, LSR #32
-    mul             v8.8h, v8.8h, v24.8h
-    mul             v9.8h, v9.8h, v25.8h
-    b.ne            3f
-    /* Right AC coef is zero */
-    dup             v15.2d, v10.d[1]
-    /* Even part: reverse the even part of the forward DCT. */
-    add             v18.4h, v4.4h, v8.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
-    add             v22.4h, v2.4h, v6.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
-    sub             v26.4h, v2.4h, v6.4h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
-    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
-    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
-    mov             v20.16b, v18.16b               /* tmp3 = z1 */
-    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
-    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
-    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
-    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
-    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
-    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
-    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
-
-    /* Odd part per figure 8; the matrix is unitary and hence its
-     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
-     */
-
-    add             v22.4h, v9.4h, v5.4h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
-    add             v24.4h, v7.4h, v3.4h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
-    add             v18.4h, v9.4h, v3.4h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
-    add             v20.4h, v7.4h, v5.4h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
-    add             v26.4h, v22.4h, v24.4h  /* z5 = z3 + z4 */
-
-    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
-    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
-    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
-    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
-    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
-    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
-    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
-    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
-    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
-
-    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
-    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
-
-    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
-    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
-    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
-    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
-
-    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
-    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
-    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
-    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
-
-    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
-
-    add             v18.4s, v2.4s, v16.4s  /* tmp10 + tmp3 */
-    sub             v20.4s, v2.4s, v16.4s  /* tmp10 - tmp3 */
-    add             v22.4s, v8.4s, v14.4s  /* tmp11 + tmp2 */
-    sub             v24.4s, v8.4s, v14.4s  /* tmp11 - tmp2 */
-    add             v26.4s, v4.4s, v12.4s  /* tmp12 + tmp1 */
-    sub             v28.4s, v4.4s, v12.4s  /* tmp12 - tmp1 */
-    add             v14.4s, v6.4s, v10.4s  /* tmp13 + tmp0 */
-    sub             v16.4s, v6.4s, v10.4s  /* tmp13 - tmp0 */
-
-    rshrn           v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
-    rshrn           v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn           v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn           v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2          v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2          v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn2          v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn2          v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
-    mov             v6.16b, v15.16b
-    mov             v7.16b, v15.16b
-    mov             v8.16b, v15.16b
-    mov             v9.16b, v15.16b
-    b               1b
-
-.balign 16
-3:
-    cbnz            TMP4, 4f
-    /* Left AC coef is zero */
-    dup             v14.2d, v10.d[0]
-    /* Even part: reverse the even part of the forward DCT. */
-    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
-    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
-    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
-    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
-    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
-    mov             v21.16b, v19.16b               /* tmp3 = z1 */
-    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
-    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
-    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
-    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
-    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
-    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
-    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
-
-    /* Odd part per figure 8; the matrix is unitary and hence its
-     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
-     */
-
-    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
-    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
-    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
-    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
-    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
-
-    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
-    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
-    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
-    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
-    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
-    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
-    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
-    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
-    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
-
-    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
-    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
-    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
-    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
-
-    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
-    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
-    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
-    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
-
-    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
-    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
-    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
-    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
-
-    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
-
-    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
-    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
-    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
-    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
-    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
-    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
-    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
-    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
-
-    mov             v2.16b, v14.16b
-    mov             v3.16b, v14.16b
-    mov             v4.16b, v14.16b
-    mov             v5.16b, v14.16b
-    rshrn           v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
-    rshrn           v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn           v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn           v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2          v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2          v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn2          v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn2          v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
-    b               1b
-
-.balign 16
-4:
-    /* "No" AC coef is zero */
-    /* Even part: reverse the even part of the forward DCT. */
-    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
-    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
-    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
-    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
-    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
-    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
-    mov             v21.16b, v19.16b               /* tmp3 = z1 */
-    mov             v20.16b, v18.16b               /* tmp3 = z1 */
-    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
-    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
-    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
-    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
-    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
-    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
-    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
-    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
-    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
-    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
-    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
-    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
-    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
-    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
-    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
-
-    /* Odd part per figure 8; the matrix is unitary and hence its
-     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
-     */
-
-    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
-    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
-    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
-    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
-    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
-
-    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
-    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
-    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
-    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
-    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
-    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
-    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
-    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
-    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
-
-    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
-    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
-    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
-    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
-    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
-    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
-    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
-    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
-    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
-
-    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
-    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
-    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
-    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
-
-    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
-    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
-    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
-    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
-    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
-    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
-    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
-    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
-
-    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
-    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
-    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
-    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
-    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
-    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
-    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
-    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
-
-    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
-
-    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
-    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
-    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
-    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
-    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
-    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
-    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
-    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
-    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
-    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
-    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
-    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
-    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
-    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
-    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
-    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
-
-    rshrn           v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
-    rshrn           v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn           v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn           v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn           v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
-    rshrn           v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn           v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn           v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2          v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2          v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn2          v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn2          v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
-    rshrn2          v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2          v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn2          v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn2          v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
-    b               1b
-
-    .unreq          DCT_TABLE  /* .unreq drops a symbolic register name; this run releases every alias listed here so the names can be reused by later code */
-    .unreq          COEF_BLOCK
-    .unreq          OUTPUT_BUF
-    .unreq          OUTPUT_COL
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMP4
-    .unreq          TMP5
-    .unreq          TMP6
-    .unreq          TMP7
-    .unreq          TMP8
-/* Likewise remove the preprocessor macros: the shift counts (CONST_BITS, PASS1_BITS) and the XFIX_* multiplier operands used by the smull/smlal instructions above, so later code may define its own. */
-#undef CENTERJSAMPLE
-#undef CONST_BITS
-#undef PASS1_BITS
-#undef XFIX_P_0_298
-#undef XFIX_N_0_390
-#undef XFIX_P_0_541
-#undef XFIX_P_0_765
-#undef XFIX_N_0_899
-#undef XFIX_P_1_175
-#undef XFIX_P_1_501
-#undef XFIX_N_1_847
-#undef XFIX_N_1_961
-#undef XFIX_P_2_053
-#undef XFIX_N_2_562
-#undef XFIX_P_3_072
-
-
-/*****************************************************************************/
-
-/*
  * jsimd_idct_ifast_neon
  *
  * This function contains a fast, not so accurate integer implementation of
author	Jonathan Wright <jonathan.wright@arm.com>	2019-06-18 16:07:52 +0100
committer	Jonathan Wright <jonathan.wright@arm.com>	2019-10-19 22:46:19 +0000
commit	bc13578529255ec75005ffc98aae151666122892 (patch)
tree	9b350ae4c29829d4bcc67a21723d044a47ca4bfb /simd/arm/arm64/jsimd_neon.S
parent	9d3bf3e9680156c48041c8b90fece504e3539a61 (diff)
download	libjpeg-turbo-bc13578529255ec75005ffc98aae151666122892.tar.gz