author    Jonathan Wright <jonathan.wright@arm.com>  2020-07-01 19:48:29 +0100
committer Jonathan Wright <jonathan.wright@arm.com>  2020-09-11 13:02:25 +0100
commit    8ca19625de302b0af290e3e0cdeff24a1c272d39
tree      13efa986ebdfd2b3613d791774501928a48ee156
parent    83889e4e3975107f6d5a51d6a2b3371b22ac0559
Implement fast DCT using Arm NEON intrinsics
Adds an Arm NEON intrinsics implementation of the 'fast', not-so-accurate
Discrete Cosine Transform (DCT). Removes the NEON assembly implementations
for both AArch32 and AArch64.

Bug: 922430
Change-Id: I9f37cd82dee36e4e1d237a031e591bac339e9239
Diffstat (limited to 'simd')
-rw-r--r--  simd/arm/arm/jsimd_neon.S        123
-rw-r--r--  simd/arm/arm64/jsimd_neon.S      113
-rw-r--r--  simd/arm/common/jfdctfst-neon.c  211
3 files changed, 211 insertions, 236 deletions
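
Before the diff itself, one note on the arithmetic shared by the removed assembly and the new intrinsics file: the DCT constants are stored as value * 2^15, at the same 8-bit precision as IJG's jfdctfst.c table (e.g. 181 * 128 = 23168 for 0.707106781), and the saturating doubling-multiply-high instruction computes (2 * a * b) >> 16. A minimal standalone C sketch, not part of the patch, showing that this reproduces jfdctfst.c's MULTIPLY(a, FIX_0_707106781), i.e. (a * 181) >> 8:

/* Standalone sketch, not part of the patch.  vqdmulhq_n_s16(a, b) computes
 * (2 * a * b) >> 16, so multiplying by 23168 (= 181 * 128, ~0.707106781 in
 * Q15) matches the (a * 181) >> 8 step used by jpeg_fdct_ifast, up to
 * saturation. */
#include <arm_neon.h>
#include <stdio.h>

int main(void)
{
  int16x8_t a = vdupq_n_s16(1000);
  int16x8_t p = vqdmulhq_n_s16(a, 23168);  /* multiply by ~0.707 */
  /* Both print 707. */
  printf("neon: %d  scalar: %d\n", vgetq_lane_s16(p, 0), (1000 * 181) >> 8);
  return 0;
}
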
diff --git a/simd/arm/arm/jsimd_neon.S b/simd/arm/arm/jsimd_neon.S
index 8fce4ee1..2c453246 100644
--- a/simd/arm/arm/jsimd_neon.S
+++ b/simd/arm/arm/jsimd_neon.S
@@ -65,129 +65,6 @@ _\fname:
/*****************************************************************************/
/*
- * jsimd_fdct_ifast_neon
- *
- * This function contains a fast, not so accurate integer implementation of
- * the forward DCT (Discrete Cosine Transform). It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
- * function from jfdctfst.c
- *
- * TODO: can be combined with 'jsimd_convsamp_neon' to get
- * rid of a bunch of VLD1.16 instructions
- */
-
-#define XFIX_0_382683433 d0[0]
-#define XFIX_0_541196100 d0[1]
-#define XFIX_0_707106781 d0[2]
-#define XFIX_1_306562965 d0[3]
-
-.balign 16
-jsimd_fdct_ifast_neon_consts:
- .short (98 * 128) /* XFIX_0_382683433 */
- .short (139 * 128) /* XFIX_0_541196100 */
- .short (181 * 128) /* XFIX_0_707106781 */
- .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
-
-asm_function jsimd_fdct_ifast_neon
-
- DATA .req r0
- TMP .req ip
-
- vpush {d8-d15}
-
- /* Load constants */
- adr TMP, jsimd_fdct_ifast_neon_consts
- vld1.16 {d0}, [TMP, :64]
-
- /* Load all DATA into NEON registers with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | d16 | d17 | q8
- * 1 | d18 | d19 | q9
- * 2 | d20 | d21 | q10
- * 3 | d22 | d23 | q11
- * 4 | d24 | d25 | q12
- * 5 | d26 | d27 | q13
- * 6 | d28 | d29 | q14
- * 7 | d30 | d31 | q15
- */
-
- vld1.16 {d16, d17, d18, d19}, [DATA, :128]!
- vld1.16 {d20, d21, d22, d23}, [DATA, :128]!
- vld1.16 {d24, d25, d26, d27}, [DATA, :128]!
- vld1.16 {d28, d29, d30, d31}, [DATA, :128]
- sub DATA, DATA, #(128 - 32)
-
- mov TMP, #2
-1:
- /* Transpose */
- vtrn.16 q12, q13
- vtrn.16 q10, q11
- vtrn.16 q8, q9
- vtrn.16 q14, q15
- vtrn.32 q9, q11
- vtrn.32 q13, q15
- vtrn.32 q8, q10
- vtrn.32 q12, q14
- vswp d30, d23
- vswp d24, d17
- vswp d26, d19
- /* 1-D FDCT */
- vadd.s16 q2, q11, q12
- vswp d28, d21
- vsub.s16 q12, q11, q12
- vsub.s16 q6, q10, q13
- vadd.s16 q10, q10, q13
- vsub.s16 q7, q9, q14
- vadd.s16 q9, q9, q14
- vsub.s16 q1, q8, q15
- vadd.s16 q8, q8, q15
- vsub.s16 q4, q9, q10
- vsub.s16 q5, q8, q2
- vadd.s16 q3, q9, q10
- vadd.s16 q4, q4, q5
- vadd.s16 q2, q8, q2
- vqdmulh.s16 q4, q4, XFIX_0_707106781
- vadd.s16 q11, q12, q6
- vadd.s16 q8, q2, q3
- vsub.s16 q12, q2, q3
- vadd.s16 q3, q6, q7
- vadd.s16 q7, q7, q1
- vqdmulh.s16 q3, q3, XFIX_0_707106781
- vsub.s16 q6, q11, q7
- vadd.s16 q10, q5, q4
- vqdmulh.s16 q6, q6, XFIX_0_382683433
- vsub.s16 q14, q5, q4
- vqdmulh.s16 q11, q11, XFIX_0_541196100
- vqdmulh.s16 q5, q7, XFIX_1_306562965
- vadd.s16 q4, q1, q3
- vsub.s16 q3, q1, q3
- vadd.s16 q7, q7, q6
- vadd.s16 q11, q11, q6
- vadd.s16 q7, q7, q5
- vadd.s16 q13, q3, q11
- vsub.s16 q11, q3, q11
- vadd.s16 q9, q4, q7
- vsub.s16 q15, q4, q7
- subs TMP, TMP, #1
- bne 1b
-
- /* store results */
- vst1.16 {d16, d17, d18, d19}, [DATA, :128]!
- vst1.16 {d20, d21, d22, d23}, [DATA, :128]!
- vst1.16 {d24, d25, d26, d27}, [DATA, :128]!
- vst1.16 {d28, d29, d30, d31}, [DATA, :128]
-
- vpop {d8-d15}
- bx lr
-
- .unreq DATA
- .unreq TMP
-
-
-/*****************************************************************************/
-
-/*
* GLOBAL(JOCTET*)
* jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
* JCOEFPTR block, int last_dc_val,
diff --git a/simd/arm/arm64/jsimd_neon.S b/simd/arm/arm64/jsimd_neon.S
index db250735..d94cfddf 100644
--- a/simd/arm/arm64/jsimd_neon.S
+++ b/simd/arm/arm64/jsimd_neon.S
@@ -86,15 +86,6 @@ Ljsimd_fdct_islow_neon_consts:
#undef F_2_562
#undef F_3_072
-/* Constants for jsimd_fdct_ifast_neon() */
-
-.balign 16
-Ljsimd_fdct_ifast_neon_consts:
- .short (98 * 128) /* XFIX_0_382683433 */
- .short (139 * 128) /* XFIX_0_541196100 */
- .short (181 * 128) /* XFIX_0_707106781 */
- .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
-
/* Constants for jsimd_huff_encode_one_block_neon() */
.balign 16
@@ -495,110 +486,6 @@ asm_function jsimd_fdct_islow_neon
/*****************************************************************************/
/*
- * jsimd_fdct_ifast_neon
- *
- * This function contains a fast, not so accurate integer implementation of
- * the forward DCT (Discrete Cosine Transform). It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
- * function from jfdctfst.c
- *
- * TODO: can be combined with 'jsimd_convsamp_neon' to get
- * rid of a bunch of VLD1.16 instructions
- */
-
-#undef XFIX_0_541196100
-#define XFIX_0_382683433 v0.h[0]
-#define XFIX_0_541196100 v0.h[1]
-#define XFIX_0_707106781 v0.h[2]
-#define XFIX_1_306562965 v0.h[3]
-
-.balign 16
-asm_function jsimd_fdct_ifast_neon
-
- DATA .req x0
- TMP .req x9
-
- /* Load constants */
- get_symbol_loc TMP, Ljsimd_fdct_ifast_neon_consts
- ld1 {v0.4h}, [TMP]
-
- /* Load all DATA into NEON registers with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | d16 | d17 | q8
- * 1 | d18 | d19 | q9
- * 2 | d20 | d21 | q10
- * 3 | d22 | d23 | q11
- * 4 | d24 | d25 | q12
- * 5 | d26 | d27 | q13
- * 6 | d28 | d29 | q14
- * 7 | d30 | d31 | q15
- */
-
- ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
- ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
- mov TMP, #2
- sub DATA, DATA, #64
-1:
- /* Transpose */
- transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
- subs TMP, TMP, #1
- /* 1-D FDCT */
- add v4.8h, v19.8h, v20.8h
- sub v20.8h, v19.8h, v20.8h
- sub v28.8h, v18.8h, v21.8h
- add v18.8h, v18.8h, v21.8h
- sub v29.8h, v17.8h, v22.8h
- add v17.8h, v17.8h, v22.8h
- sub v21.8h, v16.8h, v23.8h
- add v16.8h, v16.8h, v23.8h
- sub v6.8h, v17.8h, v18.8h
- sub v7.8h, v16.8h, v4.8h
- add v5.8h, v17.8h, v18.8h
- add v6.8h, v6.8h, v7.8h
- add v4.8h, v16.8h, v4.8h
- sqdmulh v6.8h, v6.8h, XFIX_0_707106781
- add v19.8h, v20.8h, v28.8h
- add v16.8h, v4.8h, v5.8h
- sub v20.8h, v4.8h, v5.8h
- add v5.8h, v28.8h, v29.8h
- add v29.8h, v29.8h, v21.8h
- sqdmulh v5.8h, v5.8h, XFIX_0_707106781
- sub v28.8h, v19.8h, v29.8h
- add v18.8h, v7.8h, v6.8h
- sqdmulh v28.8h, v28.8h, XFIX_0_382683433
- sub v22.8h, v7.8h, v6.8h
- sqdmulh v19.8h, v19.8h, XFIX_0_541196100
- sqdmulh v7.8h, v29.8h, XFIX_1_306562965
- add v6.8h, v21.8h, v5.8h
- sub v5.8h, v21.8h, v5.8h
- add v29.8h, v29.8h, v28.8h
- add v19.8h, v19.8h, v28.8h
- add v29.8h, v29.8h, v7.8h
- add v21.8h, v5.8h, v19.8h
- sub v19.8h, v5.8h, v19.8h
- add v17.8h, v6.8h, v29.8h
- sub v23.8h, v6.8h, v29.8h
-
- b.ne 1b
-
- /* store results */
- st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
- st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
-
- br x30
-
- .unreq DATA
- .unreq TMP
-#undef XFIX_0_382683433
-#undef XFIX_0_541196100
-#undef XFIX_0_707106781
-#undef XFIX_1_306562965
-
-
-/*****************************************************************************/
-
-/*
* GLOBAL(JOCTET *)
* jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
* JCOEFPTR block, int last_dc_val,
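
The new jfdctfst-neon.c below folds the transpose into the loads: vld4q_s16 de-interleaves a row-major 8x8 block with stride 4, and vuzpq_s16 then separates the remaining pairs so that each vector holds one column, letting pass 1 process all eight rows at once. A minimal standalone sketch of just that trick (not part of the patch):

/* Standalone sketch, not part of the patch: vld4q_s16 + vuzpq_s16 on a
 * row-major 8x8 int16_t block yields one column per vector. */
#include <arm_neon.h>
#include <stdio.h>

int main(void)
{
  int16_t block[64];
  for (int i = 0; i < 64; i++)
    block[i] = (int16_t)i;                  /* value == row * 8 + col */

  int16x8x4_t top = vld4q_s16(block);       /* rows 0-3, stride-4 de-interleave */
  int16x8x4_t bot = vld4q_s16(block + 32);  /* rows 4-7 */
  int16x8x2_t cols_04 = vuzpq_s16(top.val[0], bot.val[0]);

  int16_t col0[8];
  vst1q_s16(col0, cols_04.val[0]);
  for (int i = 0; i < 8; i++)
    printf("%d ", col0[i]);                 /* prints 0 8 16 24 32 40 48 56 */
  printf("\n");
  return 0;
}
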
diff --git a/simd/arm/common/jfdctfst-neon.c b/simd/arm/common/jfdctfst-neon.c
new file mode 100644
index 00000000..e7b2e964
--- /dev/null
+++ b/simd/arm/common/jfdctfst-neon.c
@@ -0,0 +1,211 @@
+/*
+ * jfdctfst-neon.c - fast DCT (Arm NEON)
+ *
+ * Copyright 2020 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jconfigint.h"
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <arm_neon.h>
+
+/*
+ * 'jsimd_fdct_ifast_neon' performs a fast, not so accurate forward DCT
+ * (Discrete Cosine Transform) on one block of samples. It uses the same
+ * calculations and produces exactly the same output as IJG's original
+ * 'jpeg_fdct_ifast' function, which can be found in jfdctfst.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.382683433 = 12544 * 2^-15
+ * 0.541196100 = 17792 * 2^-15
+ * 0.707106781 = 23168 * 2^-15
+ * 0.306562965 = 9984 * 2^-15
+ *
+ * See jfdctfst.c for further details of the DCT algorithm. Where possible,
+ * the variable names and comments here in 'jsimd_fdct_ifast_neon' match up
+ * with those in 'jpeg_fdct_ifast'.
+ */
+
+#define F_0_382 12544
+#define F_0_541 17792
+#define F_0_707 23168
+#define F_0_306 9984
+
+ALIGN(16) static const int16_t jsimd_fdct_ifast_neon_consts[] = {
+ F_0_382, F_0_541, F_0_707, F_0_306
+};
+
+void jsimd_fdct_ifast_neon(DCTELEM *data)
+{
+ /* Load an 8x8 block of samples into Neon registers. De-interleaving loads
+  * are used, followed by vuzp to transpose the block such that we have a
+  * column of samples per vector - allowing all rows to be processed at once.
+  */
+ int16x8x4_t data1 = vld4q_s16(data);
+ int16x8x4_t data2 = vld4q_s16(data + 4 * DCTSIZE);
+
+ int16x8x2_t cols_04 = vuzpq_s16(data1.val[0], data2.val[0]);
+ int16x8x2_t cols_15 = vuzpq_s16(data1.val[1], data2.val[1]);
+ int16x8x2_t cols_26 = vuzpq_s16(data1.val[2], data2.val[2]);
+ int16x8x2_t cols_37 = vuzpq_s16(data1.val[3], data2.val[3]);
+
+ int16x8_t col0 = cols_04.val[0];
+ int16x8_t col1 = cols_15.val[0];
+ int16x8_t col2 = cols_26.val[0];
+ int16x8_t col3 = cols_37.val[0];
+ int16x8_t col4 = cols_04.val[1];
+ int16x8_t col5 = cols_15.val[1];
+ int16x8_t col6 = cols_26.val[1];
+ int16x8_t col7 = cols_37.val[1];
+
+ /* Load DCT conversion constants. */
+ const int16x4_t consts = vld1_s16(jsimd_fdct_ifast_neon_consts);
+
+ /* Pass 1: process rows. */
+ int16x8_t tmp0 = vaddq_s16(col0, col7);
+ int16x8_t tmp7 = vsubq_s16(col0, col7);
+ int16x8_t tmp1 = vaddq_s16(col1, col6);
+ int16x8_t tmp6 = vsubq_s16(col1, col6);
+ int16x8_t tmp2 = vaddq_s16(col2, col5);
+ int16x8_t tmp5 = vsubq_s16(col2, col5);
+ int16x8_t tmp3 = vaddq_s16(col3, col4);
+ int16x8_t tmp4 = vsubq_s16(col3, col4);
+
+ /* Even part */
+ int16x8_t tmp10 = vaddq_s16(tmp0, tmp3); /* phase 2 */
+ int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
+ int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
+ int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
+
+ col0 = vaddq_s16(tmp10, tmp11); /* phase 3 */
+ col4 = vsubq_s16(tmp10, tmp11);
+
+ int16x8_t z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+ col2 = vaddq_s16(tmp13, z1); /* phase 5 */
+ col6 = vsubq_s16(tmp13, z1);
+
+ /* Odd part */
+ tmp10 = vaddq_s16(tmp4, tmp5); /* phase 2 */
+ tmp11 = vaddq_s16(tmp5, tmp6);
+ tmp12 = vaddq_s16(tmp6, tmp7);
+
+ int16x8_t z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+ int16x8_t z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+ z2 = vaddq_s16(z2, z5);
+ int16x8_t z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+ z5 = vaddq_s16(tmp12, z5);
+ z4 = vaddq_s16(z4, z5);
+ int16x8_t z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+ int16x8_t z11 = vaddq_s16(tmp7, z3); /* phase 5 */
+ int16x8_t z13 = vsubq_s16(tmp7, z3);
+
+ col5 = vaddq_s16(z13, z2); /* phase 6 */
+ col3 = vsubq_s16(z13, z2);
+ col1 = vaddq_s16(z11, z4);
+ col7 = vsubq_s16(z11, z4);
+
+ /* Transpose to work on columns in pass 2. */
+ int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
+ int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
+ int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
+ int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
+
+ int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
+ vreinterpretq_s32_s16(cols_45.val[0]));
+ int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
+ vreinterpretq_s32_s16(cols_45.val[1]));
+ int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
+ vreinterpretq_s32_s16(cols_67.val[0]));
+ int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
+ vreinterpretq_s32_s16(cols_67.val[1]));
+
+ int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
+ int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
+ int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
+ int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
+
+ int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
+ int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
+ int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
+ int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
+ int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
+ int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
+ int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
+ int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
+
+ /* Pass 2: process columns. */
+ tmp0 = vaddq_s16(row0, row7);
+ tmp7 = vsubq_s16(row0, row7);
+ tmp1 = vaddq_s16(row1, row6);
+ tmp6 = vsubq_s16(row1, row6);
+ tmp2 = vaddq_s16(row2, row5);
+ tmp5 = vsubq_s16(row2, row5);
+ tmp3 = vaddq_s16(row3, row4);
+ tmp4 = vsubq_s16(row3, row4);
+
+ /* Even part */
+ tmp10 = vaddq_s16(tmp0, tmp3); /* phase 2 */
+ tmp13 = vsubq_s16(tmp0, tmp3);
+ tmp11 = vaddq_s16(tmp1, tmp2);
+ tmp12 = vsubq_s16(tmp1, tmp2);
+
+ row0 = vaddq_s16(tmp10, tmp11); /* phase 3 */
+ row4 = vsubq_s16(tmp10, tmp11);
+
+ z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+ row2 = vaddq_s16(tmp13, z1); /* phase 5 */
+ row6 = vsubq_s16(tmp13, z1);
+
+ /* Odd part */
+ tmp10 = vaddq_s16(tmp4, tmp5); /* phase 2 */
+ tmp11 = vaddq_s16(tmp5, tmp6);
+ tmp12 = vaddq_s16(tmp6, tmp7);
+
+ z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+ z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+ z2 = vaddq_s16(z2, z5);
+ z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+ z5 = vaddq_s16(tmp12, z5);
+ z4 = vaddq_s16(z4, z5);
+ z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+ z11 = vaddq_s16(tmp7, z3); /* phase 5 */
+ z13 = vsubq_s16(tmp7, z3);
+
+ row5 = vaddq_s16(z13, z2); /* phase 6 */
+ row3 = vsubq_s16(z13, z2);
+ row1 = vaddq_s16(z11, z4);
+ row7 = vsubq_s16(z11, z4);
+
+ vst1q_s16(data + 0 * DCTSIZE, row0);
+ vst1q_s16(data + 1 * DCTSIZE, row1);
+ vst1q_s16(data + 2 * DCTSIZE, row2);
+ vst1q_s16(data + 3 * DCTSIZE, row3);
+ vst1q_s16(data + 4 * DCTSIZE, row4);
+ vst1q_s16(data + 5 * DCTSIZE, row5);
+ vst1q_s16(data + 6 * DCTSIZE, row6);
+ vst1q_s16(data + 7 * DCTSIZE, row7);
+}
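
A closing usage sketch, not part of the patch: a minimal driver for the new routine, assuming DCTELEM is 16-bit (as in an 8-bit-samples-per-component build). For a constant input block, each 1-D pass of the fast DCT sums its eight inputs into the DC term and every odd-part difference is exactly zero, so only data[0] should be nonzero afterwards.

/* Standalone sketch, not part of the patch.  Assumes DCTELEM == int16_t. */
#include <stdint.h>
#include <stdio.h>

extern void jsimd_fdct_ifast_neon(int16_t *data);

int main(void)
{
  int16_t block[64];
  for (int i = 0; i < 64; i++)
    block[i] = 16;                      /* constant (level-shifted) sample value */

  jsimd_fdct_ifast_neon(block);

  printf("DC = %d\n", block[0]);        /* expect 64 * 16 = 1024 */
  for (int i = 1; i < 64; i++)
    if (block[i] != 0)
      printf("unexpected AC coefficient at %d: %d\n", i, block[i]);
  return 0;
}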