From 8ca19625de302b0af290e3e0cdeff24a1c272d39 Mon Sep 17 00:00:00 2001
From: Jonathan Wright
Date: Wed, 1 Jul 2020 19:48:29 +0100
Subject: Implement fast DCT using Arm NEON intrinsics

Adds an Arm NEON intrinsics implementation of the 'fast' not-so-
accurate Discrete Cosine Transform (DCT). Removes the NEON assembly
implementations for both AArch32 and AArch64.

Bug: 922430
Change-Id: I9f37cd82dee36e4e1d237a031e591bac339e9239
---
 simd/arm/arm/jsimd_neon.S       | 123 -----------------------
 simd/arm/arm64/jsimd_neon.S     | 113 ---------------------
 simd/arm/common/jfdctfst-neon.c | 211 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 211 insertions(+), 236 deletions(-)
 create mode 100644 simd/arm/common/jfdctfst-neon.c

diff --git a/simd/arm/arm/jsimd_neon.S b/simd/arm/arm/jsimd_neon.S
index 8fce4ee1..2c453246 100644
--- a/simd/arm/arm/jsimd_neon.S
+++ b/simd/arm/arm/jsimd_neon.S
@@ -62,129 +62,6 @@ _\fname:
 
 #define CENTERJSAMPLE 128
 
-/*****************************************************************************/
-
-/*
- * jsimd_fdct_ifast_neon
- *
- * This function contains a fast, not so accurate integer implementation of
- * the forward DCT (Discrete Cosine Transform). It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
- * function from jfdctfst.c
- *
- * TODO: can be combined with 'jsimd_convsamp_neon' to get
- * rid of a bunch of VLD1.16 instructions
- */
-
-#define XFIX_0_382683433  d0[0]
-#define XFIX_0_541196100  d0[1]
-#define XFIX_0_707106781  d0[2]
-#define XFIX_1_306562965  d0[3]
-
-.balign 16
-jsimd_fdct_ifast_neon_consts:
-  .short (98 * 128)               /* XFIX_0_382683433 */
-  .short (139 * 128)              /* XFIX_0_541196100 */
-  .short (181 * 128)              /* XFIX_0_707106781 */
-  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
-
-asm_function jsimd_fdct_ifast_neon
-
-    DATA            .req r0
-    TMP             .req ip
-
-    vpush           {d8-d15}
-
-    /* Load constants */
-    adr             TMP, jsimd_fdct_ifast_neon_consts
-    vld1.16         {d0}, [TMP, :64]
-
-    /* Load all DATA into NEON registers with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0  | d16    | d17    | q8
-     *   1  | d18    | d19    | q9
-     *   2  | d20    | d21    | q10
-     *   3  | d22    | d23    | q11
-     *   4  | d24    | d25    | q12
-     *   5  | d26    | d27    | q13
-     *   6  | d28    | d29    | q14
-     *   7  | d30    | d31    | q15
-     */
-
-    vld1.16         {d16, d17, d18, d19}, [DATA, :128]!
-    vld1.16         {d20, d21, d22, d23}, [DATA, :128]!
-    vld1.16         {d24, d25, d26, d27}, [DATA, :128]!
-    vld1.16         {d28, d29, d30, d31}, [DATA, :128]
-    sub             DATA, DATA, #(128 - 32)
-
-    mov             TMP, #2
-1:
-    /* Transpose */
-    vtrn.16         q12, q13
-    vtrn.16         q10, q11
-    vtrn.16         q8, q9
-    vtrn.16         q14, q15
-    vtrn.32         q9, q11
-    vtrn.32         q13, q15
-    vtrn.32         q8, q10
-    vtrn.32         q12, q14
-    vswp            d30, d23
-    vswp            d24, d17
-    vswp            d26, d19
-    /* 1-D FDCT */
-    vadd.s16        q2, q11, q12
-    vswp            d28, d21
-    vsub.s16        q12, q11, q12
-    vsub.s16        q6, q10, q13
-    vadd.s16        q10, q10, q13
-    vsub.s16        q7, q9, q14
-    vadd.s16        q9, q9, q14
-    vsub.s16        q1, q8, q15
-    vadd.s16        q8, q8, q15
-    vsub.s16        q4, q9, q10
-    vsub.s16        q5, q8, q2
-    vadd.s16        q3, q9, q10
-    vadd.s16        q4, q4, q5
-    vadd.s16        q2, q8, q2
-    vqdmulh.s16     q4, q4, XFIX_0_707106781
-    vadd.s16        q11, q12, q6
-    vadd.s16        q8, q2, q3
-    vsub.s16        q12, q2, q3
-    vadd.s16        q3, q6, q7
-    vadd.s16        q7, q7, q1
-    vqdmulh.s16     q3, q3, XFIX_0_707106781
-    vsub.s16        q6, q11, q7
-    vadd.s16        q10, q5, q4
-    vqdmulh.s16     q6, q6, XFIX_0_382683433
-    vsub.s16        q14, q5, q4
-    vqdmulh.s16     q11, q11, XFIX_0_541196100
-    vqdmulh.s16     q5, q7, XFIX_1_306562965
-    vadd.s16        q4, q1, q3
-    vsub.s16        q3, q1, q3
-    vadd.s16        q7, q7, q6
-    vadd.s16        q11, q11, q6
-    vadd.s16        q7, q7, q5
-    vadd.s16        q13, q3, q11
-    vsub.s16        q11, q3, q11
-    vadd.s16        q9, q4, q7
-    vsub.s16        q15, q4, q7
-    subs            TMP, TMP, #1
-    bne             1b
-
-    /* store results */
-    vst1.16         {d16, d17, d18, d19}, [DATA, :128]!
-    vst1.16         {d20, d21, d22, d23}, [DATA, :128]!
-    vst1.16         {d24, d25, d26, d27}, [DATA, :128]!
-    vst1.16         {d28, d29, d30, d31}, [DATA, :128]
-
-    vpop            {d8-d15}
-    bx              lr
-
-    .unreq          DATA
-    .unreq          TMP
-
-
 /*****************************************************************************/
 
 /*
diff --git a/simd/arm/arm64/jsimd_neon.S b/simd/arm/arm64/jsimd_neon.S
index db250735..d94cfddf 100644
--- a/simd/arm/arm64/jsimd_neon.S
+++ b/simd/arm/arm64/jsimd_neon.S
@@ -86,15 +86,6 @@ Ljsimd_fdct_islow_neon_consts:
 #undef F_2_562
 #undef F_3_072
 
-/* Constants for jsimd_fdct_ifast_neon() */
-
-.balign 16
-Ljsimd_fdct_ifast_neon_consts:
-  .short (98 * 128)               /* XFIX_0_382683433 */
-  .short (139 * 128)              /* XFIX_0_541196100 */
-  .short (181 * 128)              /* XFIX_0_707106781 */
-  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
-
 /* Constants for jsimd_huff_encode_one_block_neon() */
 
 .balign 16
@@ -492,110 +483,6 @@ asm_function jsimd_fdct_islow_neon
 
 #undef XFIX_P_3_072
 
-/*****************************************************************************/
-
-/*
- * jsimd_fdct_ifast_neon
- *
- * This function contains a fast, not so accurate integer implementation of
- * the forward DCT (Discrete Cosine Transform). It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
- * function from jfdctfst.c
- *
- * TODO: can be combined with 'jsimd_convsamp_neon' to get
- * rid of a bunch of VLD1.16 instructions
- */
-
-#undef XFIX_0_541196100
-#define XFIX_0_382683433  v0.h[0]
-#define XFIX_0_541196100  v0.h[1]
-#define XFIX_0_707106781  v0.h[2]
-#define XFIX_1_306562965  v0.h[3]
-
-.balign 16
-asm_function jsimd_fdct_ifast_neon
-
-    DATA            .req x0
-    TMP             .req x9
-
-    /* Load constants */
-    get_symbol_loc  TMP, Ljsimd_fdct_ifast_neon_consts
-    ld1             {v0.4h}, [TMP]
-
-    /* Load all DATA into NEON registers with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0  | d16    | d17    | v16.8h
-     *   1  | d18    | d19    | v17.8h
-     *   2  | d20    | d21    | v18.8h
-     *   3  | d22    | d23    | v19.8h
-     *   4  | d24    | d25    | v20.8h
-     *   5  | d26    | d27    | v21.8h
-     *   6  | d28    | d29    | v22.8h
-     *   7  | d30    | d31    | v23.8h
-     */
-
-    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
-    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
-    mov             TMP, #2
-    sub             DATA, DATA, #64
-1:
-    /* Transpose */
-    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
-    subs            TMP, TMP, #1
-    /* 1-D FDCT */
-    add             v4.8h, v19.8h, v20.8h
-    sub             v20.8h, v19.8h, v20.8h
-    sub             v28.8h, v18.8h, v21.8h
-    add             v18.8h, v18.8h, v21.8h
-    sub             v29.8h, v17.8h, v22.8h
-    add             v17.8h, v17.8h, v22.8h
-    sub             v21.8h, v16.8h, v23.8h
-    add             v16.8h, v16.8h, v23.8h
-    sub             v6.8h, v17.8h, v18.8h
-    sub             v7.8h, v16.8h, v4.8h
-    add             v5.8h, v17.8h, v18.8h
-    add             v6.8h, v6.8h, v7.8h
-    add             v4.8h, v16.8h, v4.8h
-    sqdmulh         v6.8h, v6.8h, XFIX_0_707106781
-    add             v19.8h, v20.8h, v28.8h
-    add             v16.8h, v4.8h, v5.8h
-    sub             v20.8h, v4.8h, v5.8h
-    add             v5.8h, v28.8h, v29.8h
-    add             v29.8h, v29.8h, v21.8h
-    sqdmulh         v5.8h, v5.8h, XFIX_0_707106781
-    sub             v28.8h, v19.8h, v29.8h
-    add             v18.8h, v7.8h, v6.8h
-    sqdmulh         v28.8h, v28.8h, XFIX_0_382683433
-    sub             v22.8h, v7.8h, v6.8h
-    sqdmulh         v19.8h, v19.8h, XFIX_0_541196100
-    sqdmulh         v7.8h, v29.8h, XFIX_1_306562965
-    add             v6.8h, v21.8h, v5.8h
-    sub             v5.8h, v21.8h, v5.8h
-    add             v29.8h, v29.8h, v28.8h
-    add             v19.8h, v19.8h, v28.8h
-    add             v29.8h, v29.8h, v7.8h
-    add             v21.8h, v5.8h, v19.8h
-    sub             v19.8h, v5.8h, v19.8h
-    add             v17.8h, v6.8h, v29.8h
-    sub             v23.8h, v6.8h, v29.8h
-
-    b.ne            1b
-
-    /* store results */
-    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
-    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
-
-    br              x30
-
-    .unreq          DATA
-    .unreq          TMP
-#undef XFIX_0_382683433
-#undef XFIX_0_541196100
-#undef XFIX_0_707106781
-#undef XFIX_1_306562965
-
-
 /*****************************************************************************/
 
 /*
diff --git a/simd/arm/common/jfdctfst-neon.c b/simd/arm/common/jfdctfst-neon.c
new file mode 100644
index 00000000..e7b2e964
--- /dev/null
+++ b/simd/arm/common/jfdctfst-neon.c
@@ -0,0 +1,211 @@
+/*
+ * jfdctfst-neon.c - fast DCT (Arm NEON)
+ *
+ * Copyright 2020 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jconfigint.h"
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <arm_neon.h>
+
+/*
+ * 'jsimd_fdct_ifast_neon' performs a fast, not so accurate forward DCT
+ * (Discrete Cosine Transform) on one block of samples. It uses the same
+ * calculations and produces exactly the same output as IJG's original
+ * 'jpeg_fdct_ifast' function, which can be found in jfdctfst.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.382683433 = 12544 * 2^-15
+ *    0.541196100 = 17792 * 2^-15
+ *    0.707106781 = 23168 * 2^-15
+ *    0.306562965 =  9984 * 2^-15
+ *
+ * See jfdctfst.c for further details of the DCT algorithm. Where possible,
+ * the variable names and comments here in 'jsimd_fdct_ifast_neon' match up
+ * with those in 'jpeg_fdct_ifast'.
+ */
+
+#define F_0_382  12544
+#define F_0_541  17792
+#define F_0_707  23168
+#define F_0_306   9984
+
+ALIGN(16) static const int16_t jsimd_fdct_ifast_neon_consts[] = {
+  F_0_382, F_0_541, F_0_707, F_0_306
+};
+
+void jsimd_fdct_ifast_neon(DCTELEM *data)
+{
+  /* Load an 8x8 block of samples into Neon registers. De-interleaving loads */
+  /* are used followed by vuzp to transpose the block such that we have a    */
+  /* column of samples per vector - allowing all rows to be processed at     */
+  /* once.                                                                   */
+  int16x8x4_t data1 = vld4q_s16(data);
+  int16x8x4_t data2 = vld4q_s16(data + 4 * DCTSIZE);
+
+  int16x8x2_t cols_04 = vuzpq_s16(data1.val[0], data2.val[0]);
+  int16x8x2_t cols_15 = vuzpq_s16(data1.val[1], data2.val[1]);
+  int16x8x2_t cols_26 = vuzpq_s16(data1.val[2], data2.val[2]);
+  int16x8x2_t cols_37 = vuzpq_s16(data1.val[3], data2.val[3]);
+
+  int16x8_t col0 = cols_04.val[0];
+  int16x8_t col1 = cols_15.val[0];
+  int16x8_t col2 = cols_26.val[0];
+  int16x8_t col3 = cols_37.val[0];
+  int16x8_t col4 = cols_04.val[1];
+  int16x8_t col5 = cols_15.val[1];
+  int16x8_t col6 = cols_26.val[1];
+  int16x8_t col7 = cols_37.val[1];
+
+  /* Load DCT conversion constants. */
+  const int16x4_t consts = vld1_s16(jsimd_fdct_ifast_neon_consts);
+
+  /* Pass 1: process rows. */
+  int16x8_t tmp0 = vaddq_s16(col0, col7);
+  int16x8_t tmp7 = vsubq_s16(col0, col7);
+  int16x8_t tmp1 = vaddq_s16(col1, col6);
+  int16x8_t tmp6 = vsubq_s16(col1, col6);
+  int16x8_t tmp2 = vaddq_s16(col2, col5);
+  int16x8_t tmp5 = vsubq_s16(col2, col5);
+  int16x8_t tmp3 = vaddq_s16(col3, col4);
+  int16x8_t tmp4 = vsubq_s16(col3, col4);
+
+  /* Even part */
+  int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);   /* phase 2 */
+  int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
+  int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
+  int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
+
+  col0 = vaddq_s16(tmp10, tmp11);            /* phase 3 */
+  col4 = vsubq_s16(tmp10, tmp11);
+
+  int16x8_t z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+  col2 = vaddq_s16(tmp13, z1);               /* phase 5 */
+  col6 = vsubq_s16(tmp13, z1);
+
+  /* Odd part */
+  tmp10 = vaddq_s16(tmp4, tmp5);             /* phase 2 */
+  tmp11 = vaddq_s16(tmp5, tmp6);
+  tmp12 = vaddq_s16(tmp6, tmp7);
+
+  int16x8_t z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+  int16x8_t z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+  z2 = vaddq_s16(z2, z5);
+  int16x8_t z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+  z5 = vaddq_s16(tmp12, z5);
+  z4 = vaddq_s16(z4, z5);
+  int16x8_t z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+  int16x8_t z11 = vaddq_s16(tmp7, z3);       /* phase 5 */
+  int16x8_t z13 = vsubq_s16(tmp7, z3);
+
+  col5 = vaddq_s16(z13, z2);                 /* phase 6 */
+  col3 = vsubq_s16(z13, z2);
+  col1 = vaddq_s16(z11, z4);
+  col7 = vsubq_s16(z11, z4);
+
+  /* Transpose to work on columns in pass 2. */
+  int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
+  int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
+  int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
+  int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
+
+  int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
+                                      vreinterpretq_s32_s16(cols_45.val[0]));
+  int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
+                                      vreinterpretq_s32_s16(cols_45.val[1]));
+  int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
+                                      vreinterpretq_s32_s16(cols_67.val[0]));
+  int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
+                                      vreinterpretq_s32_s16(cols_67.val[1]));
+
+  int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
+  int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
+  int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
+  int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
+
+  int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
+  int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
+  int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
+  int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
+  int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
+  int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
+  int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
+  int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
+
+  /* Pass 2: process columns. */
+  tmp0 = vaddq_s16(row0, row7);
+  tmp7 = vsubq_s16(row0, row7);
+  tmp1 = vaddq_s16(row1, row6);
+  tmp6 = vsubq_s16(row1, row6);
+  tmp2 = vaddq_s16(row2, row5);
+  tmp5 = vsubq_s16(row2, row5);
+  tmp3 = vaddq_s16(row3, row4);
+  tmp4 = vsubq_s16(row3, row4);
+
+  /* Even part */
+  tmp10 = vaddq_s16(tmp0, tmp3);             /* phase 2 */
+  tmp13 = vsubq_s16(tmp0, tmp3);
+  tmp11 = vaddq_s16(tmp1, tmp2);
+  tmp12 = vsubq_s16(tmp1, tmp2);
+
+  row0 = vaddq_s16(tmp10, tmp11);            /* phase 3 */
+  row4 = vsubq_s16(tmp10, tmp11);
+
+  z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+  row2 = vaddq_s16(tmp13, z1);               /* phase 5 */
+  row6 = vsubq_s16(tmp13, z1);
+
+  /* Odd part */
+  tmp10 = vaddq_s16(tmp4, tmp5);             /* phase 2 */
+  tmp11 = vaddq_s16(tmp5, tmp6);
+  tmp12 = vaddq_s16(tmp6, tmp7);
+
+  z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+  z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+  z2 = vaddq_s16(z2, z5);
+  z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+  z5 = vaddq_s16(tmp12, z5);
+  z4 = vaddq_s16(z4, z5);
+  z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+  z11 = vaddq_s16(tmp7, z3);                 /* phase 5 */
+  z13 = vsubq_s16(tmp7, z3);
+
+  row5 = vaddq_s16(z13, z2);                 /* phase 6 */
+  row3 = vsubq_s16(z13, z2);
+  row1 = vaddq_s16(z11, z4);
+  row7 = vsubq_s16(z11, z4);
+
+  vst1q_s16(data + 0 * DCTSIZE, row0);
+  vst1q_s16(data + 1 * DCTSIZE, row1);
+  vst1q_s16(data + 2 * DCTSIZE, row2);
+  vst1q_s16(data + 3 * DCTSIZE, row3);
+  vst1q_s16(data + 4 * DCTSIZE, row4);
+  vst1q_s16(data + 5 * DCTSIZE, row5);
+  vst1q_s16(data + 6 * DCTSIZE, row6);
+  vst1q_s16(data + 7 * DCTSIZE, row7);
+}
--
cgit v1.2.3
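
The de-interleaving load/transpose trick at the top of jsimd_fdct_ifast_neon can be checked in isolation: vld4q_s16 de-interleaves with a stride of four as it loads, and vuzpq_s16 then separates the remaining even/odd elements, leaving one column of the 8x8 block per vector. Below is a minimal standalone sketch (illustrative only, not part of the patch; it assumes an Arm host with NEON support and a C compiler) that verifies this for columns 0 and 4:

/*
 * Standalone illustration (not part of the patch): shows that the
 * vld4q_s16 + vuzpq_s16 sequence from jsimd_fdct_ifast_neon transposes an
 * 8x8 block, yielding one column of samples per vector.
 */
#include <arm_neon.h>
#include <stdio.h>

int main(void)
{
  int16_t data[64];
  for (int i = 0; i < 64; i++)
    data[i] = (int16_t)i;            /* data[row * 8 + col] = row * 8 + col */

  /* Same load sequence as the intrinsics implementation. */
  int16x8x4_t data1 = vld4q_s16(data);
  int16x8x4_t data2 = vld4q_s16(data + 32);
  int16x8x2_t cols_04 = vuzpq_s16(data1.val[0], data2.val[0]);

  int16_t col0[8], col4[8];
  vst1q_s16(col0, cols_04.val[0]);
  vst1q_s16(col4, cols_04.val[1]);

  /* Expected: col0 = 0 8 16 24 32 40 48 56, col4 = 4 12 20 28 36 44 52 60 */
  for (int i = 0; i < 8; i++)
    printf("col0[%d] = %2d   col4[%d] = %2d\n", i, col0[i], i, col4[i]);
  return 0;
}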
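
The fixed-point multiplies are also worth a quick illustration. vqdmulhq_lane_s16 computes sat((2 * a * b) >> 16) per lane, so multiplying by a Q15 constant b = round(c * 2^15) approximates a * c with no floating-point arithmetic. The sketch below (again illustrative, not part of the patch) applies the 0.707106781 constant from the table above to a test vector and compares the result with a scalar reference:

/*
 * Standalone illustration (not part of the patch): demonstrates the Q15
 * multiply underlying the vqdmulhq_lane_s16 calls in jsimd_fdct_ifast_neon.
 */
#include <arm_neon.h>
#include <stdio.h>

int main(void)
{
  /* Same values as jsimd_fdct_ifast_neon_consts. */
  const int16_t consts_arr[4] = { 12544, 17792, 23168, 9984 };
  const int16x4_t consts = vld1_s16(consts_arr);

  const int16_t in[8] = { 1024, -512, 300, 7, -1, 2047, -2048, 128 };
  int16x8_t v = vld1q_s16(in);

  /* Multiply all eight lanes by ~0.707106781 (lane 2 of the constants). */
  int16x8_t out = vqdmulhq_lane_s16(v, consts, 2);

  int16_t res[8];
  vst1q_s16(res, out);

  for (int i = 0; i < 8; i++) {
    /* Scalar reference: (2 * a * b) >> 16, relying on arithmetic shift. */
    int32_t ref = ((int32_t)in[i] * 23168 * 2) >> 16;
    printf("%6d * 0.7071 ~= %6d (scalar reference %6d)\n",
           in[i], res[i], (int)ref);
  }
  return 0;
}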