author     Android Build Coastguard Worker <android-build-coastguard-worker@google.com>  2023-04-04 22:17:52 +0000
committer  Android Build Coastguard Worker <android-build-coastguard-worker@google.com>  2023-04-04 22:17:52 +0000
commit     ad0eb6a76e32cae294819880ee07105c158982a1 (patch)
tree       1dd29cedbba6b704f1dc6447d964096541a500f7 /libvpx/vpx_dsp
parent     79e287e28ce7149c4d059225c8f3ba9b3304c972 (diff)
parent     c1d09c0300c54c9f0c78efb6d82a83f1fcd8af56 (diff)
download   libvpx-aml_sdk_331812000.tar.gz
Snap for 9883729 from c1d09c0300c54c9f0c78efb6d82a83f1fcd8af56 to mainline-sdkext-release

Tags: aml_sdk_331812000, aml_sdk_331811000, android13-mainline-sdkext-release
Change-Id: Ifb36af904bfbcd93f375eee1b6c54a2d25579953
Diffstat (limited to 'libvpx/vpx_dsp')
66 files changed, 10270 insertions, 3163 deletions
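Note for readers: the hunks below are built around libvpx's fixed-point "butterfly" helpers. In vpx_dsp/txfm_common.h, fdct_round_shift(x) is ROUND_POWER_OF_TWO(x, DCT_CONST_BITS) with DCT_CONST_BITS == 14. A minimal scalar sketch of what the one- and two-coefficient butterflies compute follows; this is illustration only, not code from the patch, the *_scalar names are ours, and the two-coefficient argument order is taken from the post-patch comments:

#include <stdint.h>

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

/* fdct_round_shift((a + b) * c) and fdct_round_shift((a - b) * c) */
static void butterfly_one_coeff_scalar(int32_t a, int32_t b, int32_t c,
                                       int32_t *add, int32_t *sub) {
  *add = ROUND_POWER_OF_TWO((a + b) * c, DCT_CONST_BITS);
  *sub = ROUND_POWER_OF_TWO((a - b) * c, DCT_CONST_BITS);
}

/* fdct_round_shift(a * c0 + b * c1) and fdct_round_shift(a * c1 - b * c0) */
static void butterfly_two_coeff_scalar(int32_t a, int32_t b, int32_t c0,
                                       int32_t c1, int32_t *add,
                                       int32_t *sub) {
  *add = ROUND_POWER_OF_TWO(a * c0 + b * c1, DCT_CONST_BITS);
  *sub = ROUND_POWER_OF_TWO(a * c1 - b * c0, DCT_CONST_BITS);
}

The NEON routines in the diff vectorize exactly these expressions (widening 16-bit lanes to 32 bits, or keeping 32-bit lanes in the high-bitdepth paths) and narrow back with rounding shifts.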
diff --git a/libvpx/vpx_dsp/arm/fdct16x16_neon.c b/libvpx/vpx_dsp/arm/fdct16x16_neon.c index 67f43246a..a458ecaa4 100644 --- a/libvpx/vpx_dsp/arm/fdct16x16_neon.c +++ b/libvpx/vpx_dsp/arm/fdct16x16_neon.c @@ -35,22 +35,23 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { int16x8_t temp3[16]; // Left half. - load(input, stride, temp0); - cross_input(temp0, temp1, 0); - vpx_fdct16x16_body(temp1, temp0); + load_cross(input, stride, temp0); + scale_input(temp0, temp1); + vpx_fdct8x16_body(temp1, temp0); // Right half. - load(input + 8, stride, temp1); - cross_input(temp1, temp2, 0); - vpx_fdct16x16_body(temp2, temp1); + load_cross(input + 8, stride, temp1); + scale_input(temp1, temp2); + vpx_fdct8x16_body(temp2, temp1); // Transpose top left and top right quarters into one contiguous location to // process to the top half. - transpose_8x8(&temp0[0], &temp2[0]); - transpose_8x8(&temp1[0], &temp2[8]); + + transpose_s16_8x8_new(&temp0[0], &temp2[0]); + transpose_s16_8x8_new(&temp1[0], &temp2[8]); partial_round_shift(temp2); - cross_input(temp2, temp3, 1); - vpx_fdct16x16_body(temp3, temp2); + cross_input(temp2, temp3); + vpx_fdct8x16_body(temp3, temp2); transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4], &temp2[5], &temp2[6], &temp2[7]); transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12], @@ -61,12 +62,13 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Transpose bottom left and bottom right quarters into one contiguous // location to process to the bottom half. - transpose_8x8(&temp0[8], &temp1[0]); + transpose_s16_8x8_new(&temp0[8], &temp1[0]); + transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], &temp1[13], &temp1[14], &temp1[15]); partial_round_shift(temp1); - cross_input(temp1, temp0, 1); - vpx_fdct16x16_body(temp0, temp1); + cross_input(temp1, temp0); + vpx_fdct8x16_body(temp0, temp1); transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4], &temp1[5], &temp1[6], &temp1[7]); transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], @@ -74,5 +76,58 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { store(output, temp1); store(output + 8, temp1 + 8); } + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output, + int stride) { + int16x8_t temp0[16]; + int32x4_t left1[16], left2[16], left3[16], left4[16], right1[16], right2[16], + right3[16], right4[16]; + + // Left half. + load_cross(input, stride, temp0); + highbd_scale_input(temp0, left1, right1); + vpx_highbd_fdct8x16_body(left1, right1); + + // right half. + load_cross(input + 8, stride, temp0); + highbd_scale_input(temp0, left2, right2); + vpx_highbd_fdct8x16_body(left2, right2); + + // Transpose top left and top right quarters into one contiguous location to + // process to the top half. + + transpose_s32_8x8_2(left1, right1, left3, right3); + transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8); + transpose_s32_8x8_2(left1 + 8, right1 + 8, left4, right4); + transpose_s32_8x8_2(left2 + 8, right2 + 8, left4 + 8, right4 + 8); + + highbd_partial_round_shift(left3, right3); + highbd_cross_input(left3, right3, left1, right1); + vpx_highbd_fdct8x16_body(left1, right1); + + // Transpose bottom left and bottom right quarters into one contiguous + // location to process to the bottom half. 
+ + highbd_partial_round_shift(left4, right4); + highbd_cross_input(left4, right4, left2, right2); + vpx_highbd_fdct8x16_body(left2, right2); + + transpose_s32_8x8_2(left1, right1, left3, right3); + transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8); + transpose_s32_8x8_2(left1 + 8, right1 + 8, left4, right4); + transpose_s32_8x8_2(left2 + 8, right2 + 8, left4 + 8, right4 + 8); + store16_s32(output, left3); + output += 4; + store16_s32(output, right3); + output += 4; + + store16_s32(output, left4); + output += 4; + store16_s32(output, right4); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + #endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && // __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4 diff --git a/libvpx/vpx_dsp/arm/fdct16x16_neon.h b/libvpx/vpx_dsp/arm/fdct16x16_neon.h index 0dd21153f..43d820b6b 100644 --- a/libvpx/vpx_dsp/arm/fdct16x16_neon.h +++ b/libvpx/vpx_dsp/arm/fdct16x16_neon.h @@ -13,6 +13,8 @@ #include <arm_neon.h> +#include "fdct_neon.h" + static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) { b[0] = vld1q_s16(a); a += stride; @@ -72,45 +74,67 @@ static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) { // To maybe reduce register usage this could be combined with the load() step to // get the first 4 and last 4 values, cross those, then load the middle 8 values // and cross them. +static INLINE void scale_input(const int16x8_t *a /*[16]*/, + int16x8_t *b /*[16]*/) { + b[0] = vshlq_n_s16(a[0], 2); + b[1] = vshlq_n_s16(a[1], 2); + b[2] = vshlq_n_s16(a[2], 2); + b[3] = vshlq_n_s16(a[3], 2); + b[4] = vshlq_n_s16(a[4], 2); + b[5] = vshlq_n_s16(a[5], 2); + b[6] = vshlq_n_s16(a[6], 2); + b[7] = vshlq_n_s16(a[7], 2); + + b[8] = vshlq_n_s16(a[8], 2); + b[9] = vshlq_n_s16(a[9], 2); + b[10] = vshlq_n_s16(a[10], 2); + b[11] = vshlq_n_s16(a[11], 2); + b[12] = vshlq_n_s16(a[12], 2); + b[13] = vshlq_n_s16(a[13], 2); + b[14] = vshlq_n_s16(a[14], 2); + b[15] = vshlq_n_s16(a[15], 2); +} + static INLINE void cross_input(const int16x8_t *a /*[16]*/, - int16x8_t *b /*[16]*/, const int pass) { - if (pass == 0) { - b[0] = vshlq_n_s16(vaddq_s16(a[0], a[15]), 2); - b[1] = vshlq_n_s16(vaddq_s16(a[1], a[14]), 2); - b[2] = vshlq_n_s16(vaddq_s16(a[2], a[13]), 2); - b[3] = vshlq_n_s16(vaddq_s16(a[3], a[12]), 2); - b[4] = vshlq_n_s16(vaddq_s16(a[4], a[11]), 2); - b[5] = vshlq_n_s16(vaddq_s16(a[5], a[10]), 2); - b[6] = vshlq_n_s16(vaddq_s16(a[6], a[9]), 2); - b[7] = vshlq_n_s16(vaddq_s16(a[7], a[8]), 2); - - b[8] = vshlq_n_s16(vsubq_s16(a[7], a[8]), 2); - b[9] = vshlq_n_s16(vsubq_s16(a[6], a[9]), 2); - b[10] = vshlq_n_s16(vsubq_s16(a[5], a[10]), 2); - b[11] = vshlq_n_s16(vsubq_s16(a[4], a[11]), 2); - b[12] = vshlq_n_s16(vsubq_s16(a[3], a[12]), 2); - b[13] = vshlq_n_s16(vsubq_s16(a[2], a[13]), 2); - b[14] = vshlq_n_s16(vsubq_s16(a[1], a[14]), 2); - b[15] = vshlq_n_s16(vsubq_s16(a[0], a[15]), 2); - } else { - b[0] = vaddq_s16(a[0], a[15]); - b[1] = vaddq_s16(a[1], a[14]); - b[2] = vaddq_s16(a[2], a[13]); - b[3] = vaddq_s16(a[3], a[12]); - b[4] = vaddq_s16(a[4], a[11]); - b[5] = vaddq_s16(a[5], a[10]); - b[6] = vaddq_s16(a[6], a[9]); - b[7] = vaddq_s16(a[7], a[8]); - - b[8] = vsubq_s16(a[7], a[8]); - b[9] = vsubq_s16(a[6], a[9]); - b[10] = vsubq_s16(a[5], a[10]); - b[11] = vsubq_s16(a[4], a[11]); - b[12] = vsubq_s16(a[3], a[12]); - b[13] = vsubq_s16(a[2], a[13]); - b[14] = vsubq_s16(a[1], a[14]); - b[15] = vsubq_s16(a[0], a[15]); - } + int16x8_t *b /*[16]*/) { + b[0] = vaddq_s16(a[0], a[15]); + b[1] = vaddq_s16(a[1], a[14]); + 
b[2] = vaddq_s16(a[2], a[13]); + b[3] = vaddq_s16(a[3], a[12]); + b[4] = vaddq_s16(a[4], a[11]); + b[5] = vaddq_s16(a[5], a[10]); + b[6] = vaddq_s16(a[6], a[9]); + b[7] = vaddq_s16(a[7], a[8]); + + b[8] = vsubq_s16(a[7], a[8]); + b[9] = vsubq_s16(a[6], a[9]); + b[10] = vsubq_s16(a[5], a[10]); + b[11] = vsubq_s16(a[4], a[11]); + b[12] = vsubq_s16(a[3], a[12]); + b[13] = vsubq_s16(a[2], a[13]); + b[14] = vsubq_s16(a[1], a[14]); + b[15] = vsubq_s16(a[0], a[15]); +} + +static INLINE void load_cross(const int16_t *a, int stride, + int16x8_t *b /*[16]*/) { + b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride)); + b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride)); + b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride)); + b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride)); + b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride)); + b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride)); + b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride)); + b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride)); + + b[8] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride)); + b[9] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride)); + b[10] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride)); + b[11] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride)); + b[12] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride)); + b[13] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride)); + b[14] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride)); + b[15] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride)); } // Quarter round at the beginning of the second pass. 
Can't use vrshr (rounding) @@ -135,84 +159,9 @@ static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) { a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2); } -// fdct_round_shift((a +/- b) * c) -static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b, - const tran_high_t c, int16x8_t *add, - int16x8_t *sub) { - const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c); - const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c); - const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), c); - const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), c); - const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c); - const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c); - const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14); - const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14); - const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14); - const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14); - *add = vcombine_s16(rounded0, rounded1); - *sub = vcombine_s16(rounded2, rounded3); -} - -// fdct_round_shift(a * c0 +/- b * c1) -static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, - const tran_coef_t c0, - const tran_coef_t c1, int16x8_t *add, - int16x8_t *sub) { - const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c0); - const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0); - const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1); - const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1); - const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), c0); - const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), c0); - const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c1); - const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c1); - const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14); - const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14); - const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14); - const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14); - *add = vcombine_s16(rounded0, rounded1); - *sub = vcombine_s16(rounded2, rounded3); -} - -// Transpose 8x8 to a new location. Don't use transpose_neon.h because those -// are all in-place. -static INLINE void transpose_8x8(const int16x8_t *a /*[8]*/, - int16x8_t *b /*[8]*/) { - // Swap 16 bit elements. - const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); - const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); - const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); - const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); - - // Swap 32 bit elements. - const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), - vreinterpretq_s32_s16(c1.val[0])); - const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), - vreinterpretq_s32_s16(c1.val[1])); - const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), - vreinterpretq_s32_s16(c3.val[0])); - const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), - vreinterpretq_s32_s16(c3.val[1])); - - // Swap 64 bit elements - const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); - const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); - const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); - const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); - - b[0] = e0.val[0]; - b[1] = e1.val[0]; - b[2] = e2.val[0]; - b[3] = e3.val[0]; - b[4] = e0.val[1]; - b[5] = e1.val[1]; - b[6] = e2.val[1]; - b[7] = e3.val[1]; -} - // Main body of fdct16x16. 
-static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, - int16x8_t *out /*[16]*/) { +static void vpx_fdct8x16_body(const int16x8_t *in /*[16]*/, + int16x8_t *out /*[16]*/) { int16x8_t s[8]; int16x8_t x[4]; int16x8_t step[8]; @@ -237,16 +186,17 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) - butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]); - // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0], + &out[8]); + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); - butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]); + butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[4], &out[12]); // Stage 2 // Re-using source s5/s6 // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) - butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]); + butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &s[6], &s[5]); // Stage 3 x[0] = vaddq_s16(s[4], s[5]); @@ -255,12 +205,12 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, x[3] = vaddq_s16(s[7], s[6]); // Stage 4 - // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) - // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) - butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]); - // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) - // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) - butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]); + // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[2], &out[14]); + // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[10], &out[6]); // step 2 // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" @@ -272,8 +222,8 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) - butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]); - butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]); + butterfly_one_coeff_s16_fast(in[13], in[10], cospi_16_64, &s[5], &s[2]); + butterfly_one_coeff_s16_fast(in[12], in[11], cospi_16_64, &s[4], &s[3]); // step 3 s[0] = vaddq_s16(in[8], s[3]); @@ -286,13 +236,15 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, s[7] = vaddq_s16(in[15], s[4]); // step 4 - // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64) - // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64) - butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]); + // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * + // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] + // * cospi_8_64) + butterfly_two_coeff(s[6], s[1], cospi_8_64, cospi_24_64, &s[6], &s[1]); // step2[2] = 
fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) - // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64) - butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]); + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * + // cospi_24_64) + butterfly_two_coeff(x[0], x[3], cospi_24_64, cospi_8_64, &s[2], &s[5]); // step 5 step[0] = vaddq_s16(s[0], s[1]); @@ -305,23 +257,368 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, step[7] = vaddq_s16(s[7], s[6]); // step 6 - // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64) - // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64) - // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64) - // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64) - // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64) - // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] * - // cospi_22_64) - // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64) - // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64) - butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9], + // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) + // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) + butterfly_two_coeff(step[6], step[1], cospi_18_64, cospi_14_64, &out[9], &out[7]); - butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1], + // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) + // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) + butterfly_two_coeff(step[7], step[0], cospi_2_64, cospi_30_64, &out[1], &out[15]); - butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13], + + // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) + // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) + butterfly_two_coeff(step[4], step[3], cospi_26_64, cospi_6_64, &out[13], &out[3]); - butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5], + + // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) + // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) + butterfly_two_coeff(step[5], step[2], cospi_10_64, cospi_22_64, &out[5], &out[11]); } +#if CONFIG_VP9_HIGHBITDEPTH + +static INLINE void highbd_scale_input(const int16x8_t *a /*[16]*/, + int32x4_t *left /*[16]*/, + int32x4_t *right /* [16] */) { + left[0] = vshll_n_s16(vget_low_s16(a[0]), 2); + left[1] = vshll_n_s16(vget_low_s16(a[1]), 2); + left[2] = vshll_n_s16(vget_low_s16(a[2]), 2); + left[3] = vshll_n_s16(vget_low_s16(a[3]), 2); + left[4] = vshll_n_s16(vget_low_s16(a[4]), 2); + left[5] = vshll_n_s16(vget_low_s16(a[5]), 2); + left[6] = vshll_n_s16(vget_low_s16(a[6]), 2); + left[7] = vshll_n_s16(vget_low_s16(a[7]), 2); + left[8] = vshll_n_s16(vget_low_s16(a[8]), 2); + left[9] = vshll_n_s16(vget_low_s16(a[9]), 2); + left[10] = vshll_n_s16(vget_low_s16(a[10]), 2); + left[11] = vshll_n_s16(vget_low_s16(a[11]), 2); + left[12] = vshll_n_s16(vget_low_s16(a[12]), 2); + left[13] = vshll_n_s16(vget_low_s16(a[13]), 2); + left[14] = vshll_n_s16(vget_low_s16(a[14]), 2); + left[15] = vshll_n_s16(vget_low_s16(a[15]), 2); + + right[0] = vshll_n_s16(vget_high_s16(a[0]), 2); + right[1] = vshll_n_s16(vget_high_s16(a[1]), 2); + right[2] = 
vshll_n_s16(vget_high_s16(a[2]), 2); + right[3] = vshll_n_s16(vget_high_s16(a[3]), 2); + right[4] = vshll_n_s16(vget_high_s16(a[4]), 2); + right[5] = vshll_n_s16(vget_high_s16(a[5]), 2); + right[6] = vshll_n_s16(vget_high_s16(a[6]), 2); + right[7] = vshll_n_s16(vget_high_s16(a[7]), 2); + right[8] = vshll_n_s16(vget_high_s16(a[8]), 2); + right[9] = vshll_n_s16(vget_high_s16(a[9]), 2); + right[10] = vshll_n_s16(vget_high_s16(a[10]), 2); + right[11] = vshll_n_s16(vget_high_s16(a[11]), 2); + right[12] = vshll_n_s16(vget_high_s16(a[12]), 2); + right[13] = vshll_n_s16(vget_high_s16(a[13]), 2); + right[14] = vshll_n_s16(vget_high_s16(a[14]), 2); + right[15] = vshll_n_s16(vget_high_s16(a[15]), 2); +} + +static INLINE void highbd_cross_input(const int32x4_t *a_left /*[16]*/, + int32x4_t *a_right /*[16]*/, + int32x4_t *b_left /*[16]*/, + int32x4_t *b_right /*[16]*/) { + b_left[0] = vaddq_s32(a_left[0], a_left[15]); + b_left[1] = vaddq_s32(a_left[1], a_left[14]); + b_left[2] = vaddq_s32(a_left[2], a_left[13]); + b_left[3] = vaddq_s32(a_left[3], a_left[12]); + b_left[4] = vaddq_s32(a_left[4], a_left[11]); + b_left[5] = vaddq_s32(a_left[5], a_left[10]); + b_left[6] = vaddq_s32(a_left[6], a_left[9]); + b_left[7] = vaddq_s32(a_left[7], a_left[8]); + + b_right[0] = vaddq_s32(a_right[0], a_right[15]); + b_right[1] = vaddq_s32(a_right[1], a_right[14]); + b_right[2] = vaddq_s32(a_right[2], a_right[13]); + b_right[3] = vaddq_s32(a_right[3], a_right[12]); + b_right[4] = vaddq_s32(a_right[4], a_right[11]); + b_right[5] = vaddq_s32(a_right[5], a_right[10]); + b_right[6] = vaddq_s32(a_right[6], a_right[9]); + b_right[7] = vaddq_s32(a_right[7], a_right[8]); + + b_left[8] = vsubq_s32(a_left[7], a_left[8]); + b_left[9] = vsubq_s32(a_left[6], a_left[9]); + b_left[10] = vsubq_s32(a_left[5], a_left[10]); + b_left[11] = vsubq_s32(a_left[4], a_left[11]); + b_left[12] = vsubq_s32(a_left[3], a_left[12]); + b_left[13] = vsubq_s32(a_left[2], a_left[13]); + b_left[14] = vsubq_s32(a_left[1], a_left[14]); + b_left[15] = vsubq_s32(a_left[0], a_left[15]); + + b_right[8] = vsubq_s32(a_right[7], a_right[8]); + b_right[9] = vsubq_s32(a_right[6], a_right[9]); + b_right[10] = vsubq_s32(a_right[5], a_right[10]); + b_right[11] = vsubq_s32(a_right[4], a_right[11]); + b_right[12] = vsubq_s32(a_right[3], a_right[12]); + b_right[13] = vsubq_s32(a_right[2], a_right[13]); + b_right[14] = vsubq_s32(a_right[1], a_right[14]); + b_right[15] = vsubq_s32(a_right[0], a_right[15]); +} + +static INLINE void highbd_partial_round_shift(int32x4_t *left /*[16]*/, + int32x4_t *right /* [16] */) { + const int32x4_t one = vdupq_n_s32(1); + left[0] = vshrq_n_s32(vaddq_s32(left[0], one), 2); + left[1] = vshrq_n_s32(vaddq_s32(left[1], one), 2); + left[2] = vshrq_n_s32(vaddq_s32(left[2], one), 2); + left[3] = vshrq_n_s32(vaddq_s32(left[3], one), 2); + left[4] = vshrq_n_s32(vaddq_s32(left[4], one), 2); + left[5] = vshrq_n_s32(vaddq_s32(left[5], one), 2); + left[6] = vshrq_n_s32(vaddq_s32(left[6], one), 2); + left[7] = vshrq_n_s32(vaddq_s32(left[7], one), 2); + left[8] = vshrq_n_s32(vaddq_s32(left[8], one), 2); + left[9] = vshrq_n_s32(vaddq_s32(left[9], one), 2); + left[10] = vshrq_n_s32(vaddq_s32(left[10], one), 2); + left[11] = vshrq_n_s32(vaddq_s32(left[11], one), 2); + left[12] = vshrq_n_s32(vaddq_s32(left[12], one), 2); + left[13] = vshrq_n_s32(vaddq_s32(left[13], one), 2); + left[14] = vshrq_n_s32(vaddq_s32(left[14], one), 2); + left[15] = vshrq_n_s32(vaddq_s32(left[15], one), 2); + + right[0] = vshrq_n_s32(vaddq_s32(right[0], one), 2); + right[1] = 
vshrq_n_s32(vaddq_s32(right[1], one), 2); + right[2] = vshrq_n_s32(vaddq_s32(right[2], one), 2); + right[3] = vshrq_n_s32(vaddq_s32(right[3], one), 2); + right[4] = vshrq_n_s32(vaddq_s32(right[4], one), 2); + right[5] = vshrq_n_s32(vaddq_s32(right[5], one), 2); + right[6] = vshrq_n_s32(vaddq_s32(right[6], one), 2); + right[7] = vshrq_n_s32(vaddq_s32(right[7], one), 2); + right[8] = vshrq_n_s32(vaddq_s32(right[8], one), 2); + right[9] = vshrq_n_s32(vaddq_s32(right[9], one), 2); + right[10] = vshrq_n_s32(vaddq_s32(right[10], one), 2); + right[11] = vshrq_n_s32(vaddq_s32(right[11], one), 2); + right[12] = vshrq_n_s32(vaddq_s32(right[12], one), 2); + right[13] = vshrq_n_s32(vaddq_s32(right[13], one), 2); + right[14] = vshrq_n_s32(vaddq_s32(right[14], one), 2); + right[15] = vshrq_n_s32(vaddq_s32(right[15], one), 2); +} + +// Store 16 32x4 vectors, assuming stride == 16. +static INLINE void store16_s32(tran_low_t *a, const int32x4_t *b /*[32]*/) { + vst1q_s32(a, b[0]); + a += 16; + vst1q_s32(a, b[1]); + a += 16; + vst1q_s32(a, b[2]); + a += 16; + vst1q_s32(a, b[3]); + a += 16; + vst1q_s32(a, b[4]); + a += 16; + vst1q_s32(a, b[5]); + a += 16; + vst1q_s32(a, b[6]); + a += 16; + vst1q_s32(a, b[7]); + a += 16; + vst1q_s32(a, b[8]); + a += 16; + vst1q_s32(a, b[9]); + a += 16; + vst1q_s32(a, b[10]); + a += 16; + vst1q_s32(a, b[11]); + a += 16; + vst1q_s32(a, b[12]); + a += 16; + vst1q_s32(a, b[13]); + a += 16; + vst1q_s32(a, b[14]); + a += 16; + vst1q_s32(a, b[15]); +} + +// Main body of fdct8x16 column +static void vpx_highbd_fdct8x16_body(int32x4_t *left /*[16]*/, + int32x4_t *right /* [16] */) { + int32x4_t sl[8]; + int32x4_t sr[8]; + int32x4_t xl[4]; + int32x4_t xr[4]; + int32x4_t inl[8]; + int32x4_t inr[8]; + int32x4_t stepl[8]; + int32x4_t stepr[8]; + + // stage 1 + // From fwd_txfm.c: Work on the first eight values; fdct8(input, + // even_results);" + sl[0] = vaddq_s32(left[0], left[7]); + sr[0] = vaddq_s32(right[0], right[7]); + sl[1] = vaddq_s32(left[1], left[6]); + sr[1] = vaddq_s32(right[1], right[6]); + sl[2] = vaddq_s32(left[2], left[5]); + sr[2] = vaddq_s32(right[2], right[5]); + sl[3] = vaddq_s32(left[3], left[4]); + sr[3] = vaddq_s32(right[3], right[4]); + sl[4] = vsubq_s32(left[3], left[4]); + sr[4] = vsubq_s32(right[3], right[4]); + sl[5] = vsubq_s32(left[2], left[5]); + sr[5] = vsubq_s32(right[2], right[5]); + sl[6] = vsubq_s32(left[1], left[6]); + sr[6] = vsubq_s32(right[1], right[6]); + sl[7] = vsubq_s32(left[0], left[7]); + sr[7] = vsubq_s32(right[0], right[7]); + + // Copy values 8-15 as we're storing in-place + inl[0] = left[8]; + inr[0] = right[8]; + inl[1] = left[9]; + inr[1] = right[9]; + inl[2] = left[10]; + inr[2] = right[10]; + inl[3] = left[11]; + inr[3] = right[11]; + inl[4] = left[12]; + inr[4] = right[12]; + inl[5] = left[13]; + inr[5] = right[13]; + inl[6] = left[14]; + inr[6] = right[14]; + inl[7] = left[15]; + inr[7] = right[15]; + + // fdct4(step, step); + xl[0] = vaddq_s32(sl[0], sl[3]); + xr[0] = vaddq_s32(sr[0], sr[3]); + xl[1] = vaddq_s32(sl[1], sl[2]); + xr[1] = vaddq_s32(sr[1], sr[2]); + xl[2] = vsubq_s32(sl[1], sl[2]); + xr[2] = vsubq_s32(sr[1], sr[2]); + xl[3] = vsubq_s32(sl[0], sl[3]); + xr[3] = vsubq_s32(sr[0], sr[3]); + + // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) + // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64, + &left[0], &right[0], &left[8], &right[8]); + + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + // out[12] = fdct_round_shift(x3 * 
cospi_24_64 - x2 * cospi_8_64); + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64, + cospi_24_64, &left[4], &right[4], + &left[12], &right[12]); + + // Stage 2 + // Re-using source s5/s6 + // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) + // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) + butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &sl[6], + &sr[6], &sl[5], &sr[5]); + + // Stage 3 + xl[0] = vaddq_s32(sl[4], sl[5]); + xr[0] = vaddq_s32(sr[4], sr[5]); + xl[1] = vsubq_s32(sl[4], sl[5]); + xr[1] = vsubq_s32(sr[4], sr[5]); + xl[2] = vsubq_s32(sl[7], sl[6]); + xr[2] = vsubq_s32(sr[7], sr[6]); + xl[3] = vaddq_s32(sl[7], sl[6]); + xr[3] = vaddq_s32(sr[7], sr[6]); + + // Stage 4 + // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64, + cospi_28_64, &left[2], &right[2], + &left[14], &right[14]); + // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) + butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64, + cospi_12_64, &left[10], &right[10], + &left[6], &right[6]); + + // step 2 + // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" + // That file distinguished between "in_high" and "step1" but the only + // difference is that "in_high" is the first 8 values and "step 1" is the + // second. Here, since they are all in one array, "step1" values are += 8. + + // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) + // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) + // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) + // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) + butterfly_one_coeff_s32_fast(inl[5], inr[5], inl[2], inr[2], cospi_16_64, + &sl[5], &sr[5], &sl[2], &sr[2]); + butterfly_one_coeff_s32_fast(inl[4], inr[4], inl[3], inr[3], cospi_16_64, + &sl[4], &sr[4], &sl[3], &sr[3]); + + // step 3 + sl[0] = vaddq_s32(inl[0], sl[3]); + sr[0] = vaddq_s32(inr[0], sr[3]); + sl[1] = vaddq_s32(inl[1], sl[2]); + sr[1] = vaddq_s32(inr[1], sr[2]); + xl[0] = vsubq_s32(inl[1], sl[2]); + xr[0] = vsubq_s32(inr[1], sr[2]); + xl[1] = vsubq_s32(inl[0], sl[3]); + xr[1] = vsubq_s32(inr[0], sr[3]); + xl[2] = vsubq_s32(inl[7], sl[4]); + xr[2] = vsubq_s32(inr[7], sr[4]); + xl[3] = vsubq_s32(inl[6], sl[5]); + xr[3] = vsubq_s32(inr[6], sr[5]); + sl[6] = vaddq_s32(inl[6], sl[5]); + sr[6] = vaddq_s32(inr[6], sr[5]); + sl[7] = vaddq_s32(inl[7], sl[4]); + sr[7] = vaddq_s32(inr[7], sr[4]); + + // step 4 + // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * + // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] + // * cospi_8_64) + butterfly_two_coeff_s32_s64_narrow(sl[6], sr[6], sl[1], sr[1], cospi_8_64, + cospi_24_64, &sl[6], &sr[6], &sl[1], + &sr[1]); + // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * + // cospi_24_64) + butterfly_two_coeff_s32_s64_narrow(xl[0], xr[0], xl[3], xr[3], cospi_24_64, + cospi_8_64, &sl[2], &sr[2], &sl[5], + &sr[5]); + + // step 5 + stepl[0] = vaddq_s32(sl[0], sl[1]); + stepr[0] = vaddq_s32(sr[0], sr[1]); + stepl[1] = vsubq_s32(sl[0], sl[1]); + stepr[1] = vsubq_s32(sr[0], sr[1]); + stepl[2] = vaddq_s32(xl[1], sl[2]); + stepr[2] = vaddq_s32(xr[1], sr[2]); + stepl[3] = 
vsubq_s32(xl[1], sl[2]); + stepr[3] = vsubq_s32(xr[1], sr[2]); + stepl[4] = vsubq_s32(xl[2], sl[5]); + stepr[4] = vsubq_s32(xr[2], sr[5]); + stepl[5] = vaddq_s32(xl[2], sl[5]); + stepr[5] = vaddq_s32(xr[2], sr[5]); + stepl[6] = vsubq_s32(sl[7], sl[6]); + stepr[6] = vsubq_s32(sr[7], sr[6]); + stepl[7] = vaddq_s32(sl[7], sl[6]); + stepr[7] = vaddq_s32(sr[7], sr[6]); + + // step 6 + // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) + // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) + butterfly_two_coeff_s32_s64_narrow(stepl[6], stepr[6], stepl[1], stepr[1], + cospi_18_64, cospi_14_64, &left[9], + &right[9], &left[7], &right[7]); + // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) + // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) + butterfly_two_coeff_s32_s64_narrow(stepl[7], stepr[7], stepl[0], stepr[0], + cospi_2_64, cospi_30_64, &left[1], + &right[1], &left[15], &right[15]); + // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) + // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) + butterfly_two_coeff_s32_s64_narrow(stepl[4], stepr[4], stepl[3], stepr[3], + cospi_26_64, cospi_6_64, &left[13], + &right[13], &left[3], &right[3]); + // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) + // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) + butterfly_two_coeff_s32_s64_narrow(stepl[5], stepr[5], stepl[2], stepr[2], + cospi_10_64, cospi_22_64, &left[5], + &right[5], &left[11], &right[11]); +} + +#endif // CONFIG_VP9_HIGHBITDEPTH + #endif // VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_ diff --git a/libvpx/vpx_dsp/arm/fdct32x32_neon.c b/libvpx/vpx_dsp/arm/fdct32x32_neon.c index de74e6630..d6818d2ec 100644 --- a/libvpx/vpx_dsp/arm/fdct32x32_neon.c +++ b/libvpx/vpx_dsp/arm/fdct32x32_neon.c @@ -15,6 +15,8 @@ #include "vpx_dsp/txfm_common.h" #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/fdct_neon.h" +#include "vpx_dsp/arm/fdct32x32_neon.h" // Most gcc 4.9 distributions outside of Android do not generate correct code // for this function. @@ -32,1289 +34,6 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, #else -#define LOAD_INCREMENT(src, stride, dest, index) \ - do { \ - dest[index] = vld1q_s16(src); \ - src += stride; \ - } while (0) - -#define ADD_S16(src, index0, index1, dest, index3) \ - do { \ - dest[index3] = vaddq_s16(src[index0], src[index1]); \ - } while (0) - -#define ADD_SHIFT_S16(src, index0, index1) \ - do { \ - src[index1] = vshlq_n_s16(vsubq_s16(src[index0], src[index1]), 2); \ - } while (0) - -// Load, cross, and multiply by 4. Load the first 8 and last 8, then the -// middle -// 16. Doing sets of 16 at a time. Maybe sets of 8 would be better? 
-static INLINE void load(const int16_t *a, int stride, int16x8_t *b) { - const int16_t *a_end = a + 24 * stride; - int16x8_t c[8]; - - LOAD_INCREMENT(a, stride, b, 0); - LOAD_INCREMENT(a, stride, b, 1); - LOAD_INCREMENT(a, stride, b, 2); - LOAD_INCREMENT(a, stride, b, 3); - LOAD_INCREMENT(a, stride, b, 4); - LOAD_INCREMENT(a, stride, b, 5); - LOAD_INCREMENT(a, stride, b, 6); - LOAD_INCREMENT(a, stride, b, 7); - - LOAD_INCREMENT(a_end, stride, b, 24); - LOAD_INCREMENT(a_end, stride, b, 25); - LOAD_INCREMENT(a_end, stride, b, 26); - LOAD_INCREMENT(a_end, stride, b, 27); - LOAD_INCREMENT(a_end, stride, b, 28); - LOAD_INCREMENT(a_end, stride, b, 29); - LOAD_INCREMENT(a_end, stride, b, 30); - LOAD_INCREMENT(a_end, stride, b, 31); - - ADD_S16(b, 0, 31, c, 0); - ADD_S16(b, 1, 30, c, 1); - ADD_S16(b, 2, 29, c, 2); - ADD_S16(b, 3, 28, c, 3); - ADD_S16(b, 4, 27, c, 4); - ADD_S16(b, 5, 26, c, 5); - ADD_S16(b, 6, 25, c, 6); - ADD_S16(b, 7, 24, c, 7); - - ADD_SHIFT_S16(b, 7, 24); - ADD_SHIFT_S16(b, 6, 25); - ADD_SHIFT_S16(b, 5, 26); - ADD_SHIFT_S16(b, 4, 27); - ADD_SHIFT_S16(b, 3, 28); - ADD_SHIFT_S16(b, 2, 29); - ADD_SHIFT_S16(b, 1, 30); - ADD_SHIFT_S16(b, 0, 31); - - b[0] = vshlq_n_s16(c[0], 2); - b[1] = vshlq_n_s16(c[1], 2); - b[2] = vshlq_n_s16(c[2], 2); - b[3] = vshlq_n_s16(c[3], 2); - b[4] = vshlq_n_s16(c[4], 2); - b[5] = vshlq_n_s16(c[5], 2); - b[6] = vshlq_n_s16(c[6], 2); - b[7] = vshlq_n_s16(c[7], 2); - - LOAD_INCREMENT(a, stride, b, 8); - LOAD_INCREMENT(a, stride, b, 9); - LOAD_INCREMENT(a, stride, b, 10); - LOAD_INCREMENT(a, stride, b, 11); - LOAD_INCREMENT(a, stride, b, 12); - LOAD_INCREMENT(a, stride, b, 13); - LOAD_INCREMENT(a, stride, b, 14); - LOAD_INCREMENT(a, stride, b, 15); - LOAD_INCREMENT(a, stride, b, 16); - LOAD_INCREMENT(a, stride, b, 17); - LOAD_INCREMENT(a, stride, b, 18); - LOAD_INCREMENT(a, stride, b, 19); - LOAD_INCREMENT(a, stride, b, 20); - LOAD_INCREMENT(a, stride, b, 21); - LOAD_INCREMENT(a, stride, b, 22); - LOAD_INCREMENT(a, stride, b, 23); - - ADD_S16(b, 8, 23, c, 0); - ADD_S16(b, 9, 22, c, 1); - ADD_S16(b, 10, 21, c, 2); - ADD_S16(b, 11, 20, c, 3); - ADD_S16(b, 12, 19, c, 4); - ADD_S16(b, 13, 18, c, 5); - ADD_S16(b, 14, 17, c, 6); - ADD_S16(b, 15, 16, c, 7); - - ADD_SHIFT_S16(b, 15, 16); - ADD_SHIFT_S16(b, 14, 17); - ADD_SHIFT_S16(b, 13, 18); - ADD_SHIFT_S16(b, 12, 19); - ADD_SHIFT_S16(b, 11, 20); - ADD_SHIFT_S16(b, 10, 21); - ADD_SHIFT_S16(b, 9, 22); - ADD_SHIFT_S16(b, 8, 23); - - b[8] = vshlq_n_s16(c[0], 2); - b[9] = vshlq_n_s16(c[1], 2); - b[10] = vshlq_n_s16(c[2], 2); - b[11] = vshlq_n_s16(c[3], 2); - b[12] = vshlq_n_s16(c[4], 2); - b[13] = vshlq_n_s16(c[5], 2); - b[14] = vshlq_n_s16(c[6], 2); - b[15] = vshlq_n_s16(c[7], 2); -} - -#undef LOAD_INCREMENT -#undef ADD_S16 -#undef ADD_SHIFT_S16 - -#define STORE_S16(src, index, dest) \ - do { \ - store_s16q_to_tran_low(dest, src[index]); \ - dest += 8; \ - } while (0) - -// Store 32 16x8 values, assuming stride == 32. -// Slight twist: store horizontally in blocks of 8. 
-static INLINE void store(tran_low_t *a, const int16x8_t *b) { - STORE_S16(b, 0, a); - STORE_S16(b, 8, a); - STORE_S16(b, 16, a); - STORE_S16(b, 24, a); - STORE_S16(b, 1, a); - STORE_S16(b, 9, a); - STORE_S16(b, 17, a); - STORE_S16(b, 25, a); - STORE_S16(b, 2, a); - STORE_S16(b, 10, a); - STORE_S16(b, 18, a); - STORE_S16(b, 26, a); - STORE_S16(b, 3, a); - STORE_S16(b, 11, a); - STORE_S16(b, 19, a); - STORE_S16(b, 27, a); - STORE_S16(b, 4, a); - STORE_S16(b, 12, a); - STORE_S16(b, 20, a); - STORE_S16(b, 28, a); - STORE_S16(b, 5, a); - STORE_S16(b, 13, a); - STORE_S16(b, 21, a); - STORE_S16(b, 29, a); - STORE_S16(b, 6, a); - STORE_S16(b, 14, a); - STORE_S16(b, 22, a); - STORE_S16(b, 30, a); - STORE_S16(b, 7, a); - STORE_S16(b, 15, a); - STORE_S16(b, 23, a); - STORE_S16(b, 31, a); -} - -#undef STORE_S16 - -// fdct_round_shift((a +/- b) * c) -static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b, - const tran_high_t constant, - int16x8_t *add, int16x8_t *sub) { - const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); - const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant); - const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant); - const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant); - const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant); - const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant); - const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); - const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); - const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); - const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); - *add = vcombine_s16(rounded0, rounded1); - *sub = vcombine_s16(rounded2, rounded3); -} - -// fdct_round_shift(a * c0 +/- b * c1) -static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, - const tran_coef_t constant0, - const tran_coef_t constant1, - int16x8_t *add, int16x8_t *sub) { - const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0); - const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0); - const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), constant1); - const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), constant1); - const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), constant0); - const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), constant0); - const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant1); - const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant1); - const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); - const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); - const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); - const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); - *add = vcombine_s16(rounded0, rounded1); - *sub = vcombine_s16(rounded2, rounded3); -} - -// Add 2 if positive, 1 if negative, and shift by 2. -// In practice, subtract the sign bit, then shift with rounding. -static INLINE int16x8_t sub_round_shift(const int16x8_t a) { - const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); - const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); - const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); - return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2); -} - -static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) { - int16x8_t a[32]; - int16x8_t b[32]; - - // Stage 1: Done as part of the load. - - // Stage 2. - // Mini cross. X the first 16 values and the middle 8 of the second half. 
- a[0] = vaddq_s16(in[0], in[15]); - a[1] = vaddq_s16(in[1], in[14]); - a[2] = vaddq_s16(in[2], in[13]); - a[3] = vaddq_s16(in[3], in[12]); - a[4] = vaddq_s16(in[4], in[11]); - a[5] = vaddq_s16(in[5], in[10]); - a[6] = vaddq_s16(in[6], in[9]); - a[7] = vaddq_s16(in[7], in[8]); - - a[8] = vsubq_s16(in[7], in[8]); - a[9] = vsubq_s16(in[6], in[9]); - a[10] = vsubq_s16(in[5], in[10]); - a[11] = vsubq_s16(in[4], in[11]); - a[12] = vsubq_s16(in[3], in[12]); - a[13] = vsubq_s16(in[2], in[13]); - a[14] = vsubq_s16(in[1], in[14]); - a[15] = vsubq_s16(in[0], in[15]); - - a[16] = in[16]; - a[17] = in[17]; - a[18] = in[18]; - a[19] = in[19]; - - butterfly_one_coeff(in[27], in[20], cospi_16_64, &a[27], &a[20]); - butterfly_one_coeff(in[26], in[21], cospi_16_64, &a[26], &a[21]); - butterfly_one_coeff(in[25], in[22], cospi_16_64, &a[25], &a[22]); - butterfly_one_coeff(in[24], in[23], cospi_16_64, &a[24], &a[23]); - - a[28] = in[28]; - a[29] = in[29]; - a[30] = in[30]; - a[31] = in[31]; - - // Stage 3. - b[0] = vaddq_s16(a[0], a[7]); - b[1] = vaddq_s16(a[1], a[6]); - b[2] = vaddq_s16(a[2], a[5]); - b[3] = vaddq_s16(a[3], a[4]); - - b[4] = vsubq_s16(a[3], a[4]); - b[5] = vsubq_s16(a[2], a[5]); - b[6] = vsubq_s16(a[1], a[6]); - b[7] = vsubq_s16(a[0], a[7]); - - b[8] = a[8]; - b[9] = a[9]; - - butterfly_one_coeff(a[13], a[10], cospi_16_64, &b[13], &b[10]); - butterfly_one_coeff(a[12], a[11], cospi_16_64, &b[12], &b[11]); - - b[14] = a[14]; - b[15] = a[15]; - - b[16] = vaddq_s16(in[16], a[23]); - b[17] = vaddq_s16(in[17], a[22]); - b[18] = vaddq_s16(in[18], a[21]); - b[19] = vaddq_s16(in[19], a[20]); - - b[20] = vsubq_s16(in[19], a[20]); - b[21] = vsubq_s16(in[18], a[21]); - b[22] = vsubq_s16(in[17], a[22]); - b[23] = vsubq_s16(in[16], a[23]); - - b[24] = vsubq_s16(in[31], a[24]); - b[25] = vsubq_s16(in[30], a[25]); - b[26] = vsubq_s16(in[29], a[26]); - b[27] = vsubq_s16(in[28], a[27]); - - b[28] = vaddq_s16(in[28], a[27]); - b[29] = vaddq_s16(in[29], a[26]); - b[30] = vaddq_s16(in[30], a[25]); - b[31] = vaddq_s16(in[31], a[24]); - - // Stage 4. - a[0] = vaddq_s16(b[0], b[3]); - a[1] = vaddq_s16(b[1], b[2]); - a[2] = vsubq_s16(b[1], b[2]); - a[3] = vsubq_s16(b[0], b[3]); - - a[4] = b[4]; - - butterfly_one_coeff(b[6], b[5], cospi_16_64, &a[6], &a[5]); - - a[7] = b[7]; - - a[8] = vaddq_s16(b[8], b[11]); - a[9] = vaddq_s16(b[9], b[10]); - a[10] = vsubq_s16(b[9], b[10]); - a[11] = vsubq_s16(b[8], b[11]); - a[12] = vsubq_s16(b[15], b[12]); - a[13] = vsubq_s16(b[14], b[13]); - a[14] = vaddq_s16(b[14], b[13]); - a[15] = vaddq_s16(b[15], b[12]); - - a[16] = b[16]; - a[17] = b[17]; - - butterfly_two_coeff(b[29], b[18], cospi_24_64, cospi_8_64, &a[29], &a[18]); - butterfly_two_coeff(b[28], b[19], cospi_24_64, cospi_8_64, &a[28], &a[19]); - butterfly_two_coeff(b[27], b[20], -cospi_8_64, cospi_24_64, &a[27], &a[20]); - butterfly_two_coeff(b[26], b[21], -cospi_8_64, cospi_24_64, &a[26], &a[21]); - - a[22] = b[22]; - a[23] = b[23]; - a[24] = b[24]; - a[25] = b[25]; - - a[30] = b[30]; - a[31] = b[31]; - - // Stage 5. 
- butterfly_one_coeff(a[0], a[1], cospi_16_64, &b[0], &b[1]); - butterfly_two_coeff(a[3], a[2], cospi_24_64, cospi_8_64, &b[2], &b[3]); - - b[4] = vaddq_s16(a[4], a[5]); - b[5] = vsubq_s16(a[4], a[5]); - b[6] = vsubq_s16(a[7], a[6]); - b[7] = vaddq_s16(a[7], a[6]); - - b[8] = a[8]; - - butterfly_two_coeff(a[14], a[9], cospi_24_64, cospi_8_64, &b[14], &b[9]); - butterfly_two_coeff(a[13], a[10], -cospi_8_64, cospi_24_64, &b[13], &b[10]); - - b[11] = a[11]; - b[12] = a[12]; - - b[15] = a[15]; - - b[16] = vaddq_s16(a[19], a[16]); - b[17] = vaddq_s16(a[18], a[17]); - b[18] = vsubq_s16(a[17], a[18]); - b[19] = vsubq_s16(a[16], a[19]); - b[20] = vsubq_s16(a[23], a[20]); - b[21] = vsubq_s16(a[22], a[21]); - b[22] = vaddq_s16(a[21], a[22]); - b[23] = vaddq_s16(a[20], a[23]); - b[24] = vaddq_s16(a[27], a[24]); - b[25] = vaddq_s16(a[26], a[25]); - b[26] = vsubq_s16(a[25], a[26]); - b[27] = vsubq_s16(a[24], a[27]); - b[28] = vsubq_s16(a[31], a[28]); - b[29] = vsubq_s16(a[30], a[29]); - b[30] = vaddq_s16(a[29], a[30]); - b[31] = vaddq_s16(a[28], a[31]); - - // Stage 6. - a[0] = b[0]; - a[1] = b[1]; - a[2] = b[2]; - a[3] = b[3]; - - butterfly_two_coeff(b[7], b[4], cospi_28_64, cospi_4_64, &a[4], &a[7]); - butterfly_two_coeff(b[6], b[5], cospi_12_64, cospi_20_64, &a[5], &a[6]); - - a[8] = vaddq_s16(b[8], b[9]); - a[9] = vsubq_s16(b[8], b[9]); - a[10] = vsubq_s16(b[11], b[10]); - a[11] = vaddq_s16(b[11], b[10]); - a[12] = vaddq_s16(b[12], b[13]); - a[13] = vsubq_s16(b[12], b[13]); - a[14] = vsubq_s16(b[15], b[14]); - a[15] = vaddq_s16(b[15], b[14]); - - a[16] = b[16]; - a[19] = b[19]; - a[20] = b[20]; - a[23] = b[23]; - a[24] = b[24]; - a[27] = b[27]; - a[28] = b[28]; - a[31] = b[31]; - - butterfly_two_coeff(b[30], b[17], cospi_28_64, cospi_4_64, &a[30], &a[17]); - butterfly_two_coeff(b[29], b[18], -cospi_4_64, cospi_28_64, &a[29], &a[18]); - - butterfly_two_coeff(b[26], b[21], cospi_12_64, cospi_20_64, &a[26], &a[21]); - butterfly_two_coeff(b[25], b[22], -cospi_20_64, cospi_12_64, &a[25], &a[22]); - - // Stage 7. - b[0] = a[0]; - b[1] = a[1]; - b[2] = a[2]; - b[3] = a[3]; - b[4] = a[4]; - b[5] = a[5]; - b[6] = a[6]; - b[7] = a[7]; - - butterfly_two_coeff(a[15], a[8], cospi_30_64, cospi_2_64, &b[8], &b[15]); - butterfly_two_coeff(a[14], a[9], cospi_14_64, cospi_18_64, &b[9], &b[14]); - butterfly_two_coeff(a[13], a[10], cospi_22_64, cospi_10_64, &b[10], &b[13]); - butterfly_two_coeff(a[12], a[11], cospi_6_64, cospi_26_64, &b[11], &b[12]); - - b[16] = vaddq_s16(a[16], a[17]); - b[17] = vsubq_s16(a[16], a[17]); - b[18] = vsubq_s16(a[19], a[18]); - b[19] = vaddq_s16(a[19], a[18]); - b[20] = vaddq_s16(a[20], a[21]); - b[21] = vsubq_s16(a[20], a[21]); - b[22] = vsubq_s16(a[23], a[22]); - b[23] = vaddq_s16(a[23], a[22]); - b[24] = vaddq_s16(a[24], a[25]); - b[25] = vsubq_s16(a[24], a[25]); - b[26] = vsubq_s16(a[27], a[26]); - b[27] = vaddq_s16(a[27], a[26]); - b[28] = vaddq_s16(a[28], a[29]); - b[29] = vsubq_s16(a[28], a[29]); - b[30] = vsubq_s16(a[31], a[30]); - b[31] = vaddq_s16(a[31], a[30]); - - // Final stage. 
- // Also compute partial rounding shift: - // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; - out[0] = sub_round_shift(b[0]); - out[16] = sub_round_shift(b[1]); - out[8] = sub_round_shift(b[2]); - out[24] = sub_round_shift(b[3]); - out[4] = sub_round_shift(b[4]); - out[20] = sub_round_shift(b[5]); - out[12] = sub_round_shift(b[6]); - out[28] = sub_round_shift(b[7]); - out[2] = sub_round_shift(b[8]); - out[18] = sub_round_shift(b[9]); - out[10] = sub_round_shift(b[10]); - out[26] = sub_round_shift(b[11]); - out[6] = sub_round_shift(b[12]); - out[22] = sub_round_shift(b[13]); - out[14] = sub_round_shift(b[14]); - out[30] = sub_round_shift(b[15]); - - butterfly_two_coeff(b[31], b[16], cospi_31_64, cospi_1_64, &a[1], &a[31]); - out[1] = sub_round_shift(a[1]); - out[31] = sub_round_shift(a[31]); - - butterfly_two_coeff(b[30], b[17], cospi_15_64, cospi_17_64, &a[17], &a[15]); - out[17] = sub_round_shift(a[17]); - out[15] = sub_round_shift(a[15]); - - butterfly_two_coeff(b[29], b[18], cospi_23_64, cospi_9_64, &a[9], &a[23]); - out[9] = sub_round_shift(a[9]); - out[23] = sub_round_shift(a[23]); - - butterfly_two_coeff(b[28], b[19], cospi_7_64, cospi_25_64, &a[25], &a[7]); - out[25] = sub_round_shift(a[25]); - out[7] = sub_round_shift(a[7]); - - butterfly_two_coeff(b[27], b[20], cospi_27_64, cospi_5_64, &a[5], &a[27]); - out[5] = sub_round_shift(a[5]); - out[27] = sub_round_shift(a[27]); - - butterfly_two_coeff(b[26], b[21], cospi_11_64, cospi_21_64, &a[21], &a[11]); - out[21] = sub_round_shift(a[21]); - out[11] = sub_round_shift(a[11]); - - butterfly_two_coeff(b[25], b[22], cospi_19_64, cospi_13_64, &a[13], &a[19]); - out[13] = sub_round_shift(a[13]); - out[19] = sub_round_shift(a[19]); - - butterfly_two_coeff(b[24], b[23], cospi_3_64, cospi_29_64, &a[29], &a[3]); - out[29] = sub_round_shift(a[29]); - out[3] = sub_round_shift(a[3]); -} - -#define PASS_THROUGH(src, dst, element) \ - do { \ - dst##_lo[element] = src##_lo[element]; \ - dst##_hi[element] = src##_hi[element]; \ - } while (0) - -#define ADD_S16_S32(a, left_index, right_index, b, b_index) \ - do { \ - b##_lo[b_index] = \ - vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ - b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \ - vget_high_s16(a[right_index])); \ - } while (0) - -#define SUB_S16_S32(a, left_index, right_index, b, b_index) \ - do { \ - b##_lo[b_index] = \ - vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ - b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \ - vget_high_s16(a[right_index])); \ - } while (0) - -#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \ - do { \ - c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \ - c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \ - } while (0) - -#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \ - do { \ - temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \ - temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \ - c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \ - c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \ - } while (0) - -#define ADD_S32(a, left_index, right_index, b, b_index) \ - do { \ - b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \ - b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \ - } while (0) - -#define SUB_S32(a, left_index, right_index, b, b_index) \ - do { \ - b##_lo[b_index] = 
vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \ - b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \ - } while (0) - -// Like butterfly_one_coeff, but don't narrow results. -static INLINE void butterfly_one_coeff_s16_s32( - const int16x8_t a, const int16x8_t b, const tran_high_t constant, - int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, - int32x4_t *sub_hi) { - const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); - const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant); - const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant); - const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant); - const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant); - const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant); - *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); - *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); - *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); - *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); -} - -#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \ - add_index, sub_index) \ - do { \ - butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \ - &b##_lo[add_index], &b##_hi[add_index], \ - &b##_lo[sub_index], &b##_hi[sub_index]); \ - } while (0) - -// Like butterfly_one_coeff, but with s32. -static INLINE void butterfly_one_coeff_s32( - const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, - const int32x4_t b_hi, const int32_t constant, int32x4_t *add_lo, - int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) { - const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant); - const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant); - const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant); - const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant); - const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant); - const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant); - *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); - *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); - *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); - *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); -} - -#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \ - sub_index) \ - do { \ - butterfly_one_coeff_s32(a##_lo[left_index], a##_hi[left_index], \ - a##_lo[right_index], a##_hi[right_index], \ - constant, &b##_lo[add_index], &b##_hi[add_index], \ - &b##_lo[sub_index], &b##_hi[sub_index]); \ - } while (0) - -// Like butterfly_two_coeff, but with s32. 
-static INLINE void butterfly_two_coeff_s32( - const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, - const int32x4_t b_hi, const int32_t constant0, const int32_t constant1, - int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, - int32x4_t *sub_hi) { - const int32x4_t a0 = vmulq_n_s32(a_lo, constant0); - const int32x4_t a1 = vmulq_n_s32(a_hi, constant0); - const int32x4_t a2 = vmulq_n_s32(a_lo, constant1); - const int32x4_t a3 = vmulq_n_s32(a_hi, constant1); - const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0); - const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0); - const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1); - const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1); - *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); - *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); - *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); - *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); -} - -#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \ - right_constant, b, add_index, sub_index) \ - do { \ - butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \ - a##_lo[right_index], a##_hi[right_index], \ - left_constant, right_constant, &b##_lo[add_index], \ - &b##_hi[add_index], &b##_lo[sub_index], \ - &b##_hi[sub_index]); \ - } while (0) - -// Add 1 if positive, 2 if negative, and shift by 2. -// In practice, add 1, then add the sign bit, then shift without rounding. -static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo, - const int32x4_t a_hi) { - const int32x4_t one = vdupq_n_s32(1); - const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo); - const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31); - const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32); - const int16x4_t b_lo = - vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2); - const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi); - const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31); - const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32); - const int16x4_t b_hi = - vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2); - return vcombine_s16(b_lo, b_hi); -} - -static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) { - int16x8_t a[32]; - int16x8_t b[32]; - int32x4_t c_lo[32]; - int32x4_t c_hi[32]; - int32x4_t d_lo[32]; - int32x4_t d_hi[32]; - - // Stage 1. Done as part of the load for the first pass. 
- a[0] = vaddq_s16(in[0], in[31]); - a[1] = vaddq_s16(in[1], in[30]); - a[2] = vaddq_s16(in[2], in[29]); - a[3] = vaddq_s16(in[3], in[28]); - a[4] = vaddq_s16(in[4], in[27]); - a[5] = vaddq_s16(in[5], in[26]); - a[6] = vaddq_s16(in[6], in[25]); - a[7] = vaddq_s16(in[7], in[24]); - a[8] = vaddq_s16(in[8], in[23]); - a[9] = vaddq_s16(in[9], in[22]); - a[10] = vaddq_s16(in[10], in[21]); - a[11] = vaddq_s16(in[11], in[20]); - a[12] = vaddq_s16(in[12], in[19]); - a[13] = vaddq_s16(in[13], in[18]); - a[14] = vaddq_s16(in[14], in[17]); - a[15] = vaddq_s16(in[15], in[16]); - a[16] = vsubq_s16(in[15], in[16]); - a[17] = vsubq_s16(in[14], in[17]); - a[18] = vsubq_s16(in[13], in[18]); - a[19] = vsubq_s16(in[12], in[19]); - a[20] = vsubq_s16(in[11], in[20]); - a[21] = vsubq_s16(in[10], in[21]); - a[22] = vsubq_s16(in[9], in[22]); - a[23] = vsubq_s16(in[8], in[23]); - a[24] = vsubq_s16(in[7], in[24]); - a[25] = vsubq_s16(in[6], in[25]); - a[26] = vsubq_s16(in[5], in[26]); - a[27] = vsubq_s16(in[4], in[27]); - a[28] = vsubq_s16(in[3], in[28]); - a[29] = vsubq_s16(in[2], in[29]); - a[30] = vsubq_s16(in[1], in[30]); - a[31] = vsubq_s16(in[0], in[31]); - - // Stage 2. - b[0] = vaddq_s16(a[0], a[15]); - b[1] = vaddq_s16(a[1], a[14]); - b[2] = vaddq_s16(a[2], a[13]); - b[3] = vaddq_s16(a[3], a[12]); - b[4] = vaddq_s16(a[4], a[11]); - b[5] = vaddq_s16(a[5], a[10]); - b[6] = vaddq_s16(a[6], a[9]); - b[7] = vaddq_s16(a[7], a[8]); - - b[8] = vsubq_s16(a[7], a[8]); - b[9] = vsubq_s16(a[6], a[9]); - b[10] = vsubq_s16(a[5], a[10]); - b[11] = vsubq_s16(a[4], a[11]); - b[12] = vsubq_s16(a[3], a[12]); - b[13] = vsubq_s16(a[2], a[13]); - b[14] = vsubq_s16(a[1], a[14]); - b[15] = vsubq_s16(a[0], a[15]); - - b[16] = a[16]; - b[17] = a[17]; - b[18] = a[18]; - b[19] = a[19]; - - butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]); - butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]); - butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]); - butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]); - - b[28] = a[28]; - b[29] = a[29]; - b[30] = a[30]; - b[31] = a[31]; - - // Stage 3. With extreme values for input this calculation rolls over int16_t. - // The sources for b[0] get added multiple times and, through testing, have - // been shown to overflow starting here. - ADD_S16_S32(b, 0, 7, c, 0); - ADD_S16_S32(b, 1, 6, c, 1); - ADD_S16_S32(b, 2, 5, c, 2); - ADD_S16_S32(b, 3, 4, c, 3); - SUB_S16_S32(b, 3, 4, c, 4); - SUB_S16_S32(b, 2, 5, c, 5); - SUB_S16_S32(b, 1, 6, c, 6); - SUB_S16_S32(b, 0, 7, c, 7); - - a[8] = b[8]; - a[9] = b[9]; - - BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10); - BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11); - - a[14] = b[14]; - a[15] = b[15]; - - ADD_S16_S32(b, 16, 23, c, 16); - ADD_S16_S32(b, 17, 22, c, 17); - ADD_S16_S32(b, 18, 21, c, 18); - ADD_S16_S32(b, 19, 20, c, 19); - SUB_S16_S32(b, 19, 20, c, 20); - SUB_S16_S32(b, 18, 21, c, 21); - SUB_S16_S32(b, 17, 22, c, 22); - SUB_S16_S32(b, 16, 23, c, 23); - SUB_S16_S32(b, 31, 24, c, 24); - SUB_S16_S32(b, 30, 25, c, 25); - SUB_S16_S32(b, 29, 26, c, 26); - SUB_S16_S32(b, 28, 27, c, 27); - ADD_S16_S32(b, 28, 27, c, 28); - ADD_S16_S32(b, 29, 26, c, 29); - ADD_S16_S32(b, 30, 25, c, 30); - ADD_S16_S32(b, 31, 24, c, 31); - - // Stage 4. 
- ADD_S32(c, 0, 3, d, 0); - ADD_S32(c, 1, 2, d, 1); - SUB_S32(c, 1, 2, d, 2); - SUB_S32(c, 0, 3, d, 3); - - PASS_THROUGH(c, d, 4); - - BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5); - - PASS_THROUGH(c, d, 7); - - ADDW_S16_S32(c, 11, a, 8, d, 8); - ADDW_S16_S32(c, 10, a, 9, d, 9); - SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10); - SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11); - SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12); - SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13); - ADDW_S16_S32(c, 13, b, 14, d, 14); - ADDW_S16_S32(c, 12, b, 15, d, 15); - - PASS_THROUGH(c, d, 16); - PASS_THROUGH(c, d, 17); - - BUTTERFLY_TWO_S32(c, 29, 18, cospi_24_64, cospi_8_64, d, 29, 18); - BUTTERFLY_TWO_S32(c, 28, 19, cospi_24_64, cospi_8_64, d, 28, 19); - BUTTERFLY_TWO_S32(c, 27, 20, -cospi_8_64, cospi_24_64, d, 27, 20); - BUTTERFLY_TWO_S32(c, 26, 21, -cospi_8_64, cospi_24_64, d, 26, 21); - - PASS_THROUGH(c, d, 22); - PASS_THROUGH(c, d, 23); - PASS_THROUGH(c, d, 24); - PASS_THROUGH(c, d, 25); - - PASS_THROUGH(c, d, 30); - PASS_THROUGH(c, d, 31); - - // Stage 5. - BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1); - BUTTERFLY_TWO_S32(d, 3, 2, cospi_24_64, cospi_8_64, c, 2, 3); - - ADD_S32(d, 4, 5, c, 4); - SUB_S32(d, 4, 5, c, 5); - SUB_S32(d, 7, 6, c, 6); - ADD_S32(d, 7, 6, c, 7); - - PASS_THROUGH(d, c, 8); - - BUTTERFLY_TWO_S32(d, 14, 9, cospi_24_64, cospi_8_64, c, 14, 9); - BUTTERFLY_TWO_S32(d, 13, 10, -cospi_8_64, cospi_24_64, c, 13, 10); - - PASS_THROUGH(d, c, 11); - PASS_THROUGH(d, c, 12); - PASS_THROUGH(d, c, 15); - - ADD_S32(d, 16, 19, c, 16); - ADD_S32(d, 17, 18, c, 17); - SUB_S32(d, 17, 18, c, 18); - SUB_S32(d, 16, 19, c, 19); - SUB_S32(d, 23, 20, c, 20); - SUB_S32(d, 22, 21, c, 21); - ADD_S32(d, 22, 21, c, 22); - ADD_S32(d, 23, 20, c, 23); - ADD_S32(d, 24, 27, c, 24); - ADD_S32(d, 25, 26, c, 25); - SUB_S32(d, 25, 26, c, 26); - SUB_S32(d, 24, 27, c, 27); - SUB_S32(d, 31, 28, c, 28); - SUB_S32(d, 30, 29, c, 29); - ADD_S32(d, 30, 29, c, 30); - ADD_S32(d, 31, 28, c, 31); - - // Stage 6. - PASS_THROUGH(c, d, 0); - PASS_THROUGH(c, d, 1); - PASS_THROUGH(c, d, 2); - PASS_THROUGH(c, d, 3); - - BUTTERFLY_TWO_S32(c, 7, 4, cospi_28_64, cospi_4_64, d, 4, 7); - BUTTERFLY_TWO_S32(c, 6, 5, cospi_12_64, cospi_20_64, d, 5, 6); - - ADD_S32(c, 8, 9, d, 8); - SUB_S32(c, 8, 9, d, 9); - SUB_S32(c, 11, 10, d, 10); - ADD_S32(c, 11, 10, d, 11); - ADD_S32(c, 12, 13, d, 12); - SUB_S32(c, 12, 13, d, 13); - SUB_S32(c, 15, 14, d, 14); - ADD_S32(c, 15, 14, d, 15); - - PASS_THROUGH(c, d, 16); - PASS_THROUGH(c, d, 19); - PASS_THROUGH(c, d, 20); - PASS_THROUGH(c, d, 23); - PASS_THROUGH(c, d, 24); - PASS_THROUGH(c, d, 27); - PASS_THROUGH(c, d, 28); - PASS_THROUGH(c, d, 31); - - BUTTERFLY_TWO_S32(c, 30, 17, cospi_28_64, cospi_4_64, d, 30, 17); - BUTTERFLY_TWO_S32(c, 29, 18, -cospi_4_64, cospi_28_64, d, 29, 18); - BUTTERFLY_TWO_S32(c, 26, 21, cospi_12_64, cospi_20_64, d, 26, 21); - BUTTERFLY_TWO_S32(c, 25, 22, -cospi_20_64, cospi_12_64, d, 25, 22); - - // Stage 7. 
- PASS_THROUGH(d, c, 0); - PASS_THROUGH(d, c, 1); - PASS_THROUGH(d, c, 2); - PASS_THROUGH(d, c, 3); - PASS_THROUGH(d, c, 4); - PASS_THROUGH(d, c, 5); - PASS_THROUGH(d, c, 6); - PASS_THROUGH(d, c, 7); - - BUTTERFLY_TWO_S32(d, 15, 8, cospi_30_64, cospi_2_64, c, 8, 15); - BUTTERFLY_TWO_S32(d, 14, 9, cospi_14_64, cospi_18_64, c, 9, 14); - BUTTERFLY_TWO_S32(d, 13, 10, cospi_22_64, cospi_10_64, c, 10, 13); - BUTTERFLY_TWO_S32(d, 12, 11, cospi_6_64, cospi_26_64, c, 11, 12); - - ADD_S32(d, 16, 17, c, 16); - SUB_S32(d, 16, 17, c, 17); - SUB_S32(d, 19, 18, c, 18); - ADD_S32(d, 19, 18, c, 19); - ADD_S32(d, 20, 21, c, 20); - SUB_S32(d, 20, 21, c, 21); - SUB_S32(d, 23, 22, c, 22); - ADD_S32(d, 23, 22, c, 23); - ADD_S32(d, 24, 25, c, 24); - SUB_S32(d, 24, 25, c, 25); - SUB_S32(d, 27, 26, c, 26); - ADD_S32(d, 27, 26, c, 27); - ADD_S32(d, 28, 29, c, 28); - SUB_S32(d, 28, 29, c, 29); - SUB_S32(d, 31, 30, c, 30); - ADD_S32(d, 31, 30, c, 31); - - // Final stage. - // Roll rounding into this function so we can pass back int16x8. - - out[0] = add_round_shift_s32(c_lo[0], c_hi[0]); - out[16] = add_round_shift_s32(c_lo[1], c_hi[1]); - - out[8] = add_round_shift_s32(c_lo[2], c_hi[2]); - out[24] = add_round_shift_s32(c_lo[3], c_hi[3]); - out[4] = add_round_shift_s32(c_lo[4], c_hi[4]); - out[20] = add_round_shift_s32(c_lo[5], c_hi[5]); - out[12] = add_round_shift_s32(c_lo[6], c_hi[6]); - - out[28] = add_round_shift_s32(c_lo[7], c_hi[7]); - out[2] = add_round_shift_s32(c_lo[8], c_hi[8]); - out[18] = add_round_shift_s32(c_lo[9], c_hi[9]); - out[10] = add_round_shift_s32(c_lo[10], c_hi[10]); - - out[26] = add_round_shift_s32(c_lo[11], c_hi[11]); - out[6] = add_round_shift_s32(c_lo[12], c_hi[12]); - out[22] = add_round_shift_s32(c_lo[13], c_hi[13]); - out[14] = add_round_shift_s32(c_lo[14], c_hi[14]); - out[30] = add_round_shift_s32(c_lo[15], c_hi[15]); - - BUTTERFLY_TWO_S32(c, 31, 16, cospi_31_64, cospi_1_64, d, 1, 31); - out[1] = add_round_shift_s32(d_lo[1], d_hi[1]); - out[31] = add_round_shift_s32(d_lo[31], d_hi[31]); - - BUTTERFLY_TWO_S32(c, 30, 17, cospi_15_64, cospi_17_64, d, 17, 15); - out[17] = add_round_shift_s32(d_lo[17], d_hi[17]); - out[15] = add_round_shift_s32(d_lo[15], d_hi[15]); - - BUTTERFLY_TWO_S32(c, 29, 18, cospi_23_64, cospi_9_64, d, 9, 23); - out[9] = add_round_shift_s32(d_lo[9], d_hi[9]); - out[23] = add_round_shift_s32(d_lo[23], d_hi[23]); - - BUTTERFLY_TWO_S32(c, 28, 19, cospi_7_64, cospi_25_64, d, 25, 7); - out[25] = add_round_shift_s32(d_lo[25], d_hi[25]); - out[7] = add_round_shift_s32(d_lo[7], d_hi[7]); - - BUTTERFLY_TWO_S32(c, 27, 20, cospi_27_64, cospi_5_64, d, 5, 27); - out[5] = add_round_shift_s32(d_lo[5], d_hi[5]); - out[27] = add_round_shift_s32(d_lo[27], d_hi[27]); - - BUTTERFLY_TWO_S32(c, 26, 21, cospi_11_64, cospi_21_64, d, 21, 11); - out[21] = add_round_shift_s32(d_lo[21], d_hi[21]); - out[11] = add_round_shift_s32(d_lo[11], d_hi[11]); - - BUTTERFLY_TWO_S32(c, 25, 22, cospi_19_64, cospi_13_64, d, 13, 19); - out[13] = add_round_shift_s32(d_lo[13], d_hi[13]); - out[19] = add_round_shift_s32(d_lo[19], d_hi[19]); - - BUTTERFLY_TWO_S32(c, 24, 23, cospi_3_64, cospi_29_64, d, 29, 3); - out[29] = add_round_shift_s32(d_lo[29], d_hi[29]); - out[3] = add_round_shift_s32(d_lo[3], d_hi[3]); -} - -// Add 1 if positive, 2 if negative, and shift by 2. -// In practice, add 1, then add the sign bit, then shift without rounding. 
-static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) { - const int16x8_t one = vdupq_n_s16(1); - const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); - const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); - const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); - return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2); -} - -static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) { - int16x8_t a[32]; - int16x8_t b[32]; - - // Stage 1. Done as part of the load for the first pass. - a[0] = vaddq_s16(in[0], in[31]); - a[1] = vaddq_s16(in[1], in[30]); - a[2] = vaddq_s16(in[2], in[29]); - a[3] = vaddq_s16(in[3], in[28]); - a[4] = vaddq_s16(in[4], in[27]); - a[5] = vaddq_s16(in[5], in[26]); - a[6] = vaddq_s16(in[6], in[25]); - a[7] = vaddq_s16(in[7], in[24]); - a[8] = vaddq_s16(in[8], in[23]); - a[9] = vaddq_s16(in[9], in[22]); - a[10] = vaddq_s16(in[10], in[21]); - a[11] = vaddq_s16(in[11], in[20]); - a[12] = vaddq_s16(in[12], in[19]); - a[13] = vaddq_s16(in[13], in[18]); - a[14] = vaddq_s16(in[14], in[17]); - a[15] = vaddq_s16(in[15], in[16]); - a[16] = vsubq_s16(in[15], in[16]); - a[17] = vsubq_s16(in[14], in[17]); - a[18] = vsubq_s16(in[13], in[18]); - a[19] = vsubq_s16(in[12], in[19]); - a[20] = vsubq_s16(in[11], in[20]); - a[21] = vsubq_s16(in[10], in[21]); - a[22] = vsubq_s16(in[9], in[22]); - a[23] = vsubq_s16(in[8], in[23]); - a[24] = vsubq_s16(in[7], in[24]); - a[25] = vsubq_s16(in[6], in[25]); - a[26] = vsubq_s16(in[5], in[26]); - a[27] = vsubq_s16(in[4], in[27]); - a[28] = vsubq_s16(in[3], in[28]); - a[29] = vsubq_s16(in[2], in[29]); - a[30] = vsubq_s16(in[1], in[30]); - a[31] = vsubq_s16(in[0], in[31]); - - // Stage 2. - // For the "rd" version, all the values are rounded down after stage 2 to keep - // the values in 16 bits. 
- b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15])); - b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14])); - b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13])); - b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12])); - b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11])); - b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10])); - b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9])); - b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8])); - - b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8])); - b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9])); - b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10])); - b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11])); - b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12])); - b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13])); - b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14])); - b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15])); - - b[16] = add_round_shift_s16(a[16]); - b[17] = add_round_shift_s16(a[17]); - b[18] = add_round_shift_s16(a[18]); - b[19] = add_round_shift_s16(a[19]); - - butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]); - butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]); - butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]); - butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]); - b[20] = add_round_shift_s16(b[20]); - b[21] = add_round_shift_s16(b[21]); - b[22] = add_round_shift_s16(b[22]); - b[23] = add_round_shift_s16(b[23]); - b[24] = add_round_shift_s16(b[24]); - b[25] = add_round_shift_s16(b[25]); - b[26] = add_round_shift_s16(b[26]); - b[27] = add_round_shift_s16(b[27]); - - b[28] = add_round_shift_s16(a[28]); - b[29] = add_round_shift_s16(a[29]); - b[30] = add_round_shift_s16(a[30]); - b[31] = add_round_shift_s16(a[31]); - - // Stage 3. - a[0] = vaddq_s16(b[0], b[7]); - a[1] = vaddq_s16(b[1], b[6]); - a[2] = vaddq_s16(b[2], b[5]); - a[3] = vaddq_s16(b[3], b[4]); - - a[4] = vsubq_s16(b[3], b[4]); - a[5] = vsubq_s16(b[2], b[5]); - a[6] = vsubq_s16(b[1], b[6]); - a[7] = vsubq_s16(b[0], b[7]); - - a[8] = b[8]; - a[9] = b[9]; - - butterfly_one_coeff(b[13], b[10], cospi_16_64, &a[13], &a[10]); - butterfly_one_coeff(b[12], b[11], cospi_16_64, &a[12], &a[11]); - - a[14] = b[14]; - a[15] = b[15]; - - a[16] = vaddq_s16(b[16], b[23]); - a[17] = vaddq_s16(b[17], b[22]); - a[18] = vaddq_s16(b[18], b[21]); - a[19] = vaddq_s16(b[19], b[20]); - - a[20] = vsubq_s16(b[19], b[20]); - a[21] = vsubq_s16(b[18], b[21]); - a[22] = vsubq_s16(b[17], b[22]); - a[23] = vsubq_s16(b[16], b[23]); - - a[24] = vsubq_s16(b[31], b[24]); - a[25] = vsubq_s16(b[30], b[25]); - a[26] = vsubq_s16(b[29], b[26]); - a[27] = vsubq_s16(b[28], b[27]); - - a[28] = vaddq_s16(b[28], b[27]); - a[29] = vaddq_s16(b[29], b[26]); - a[30] = vaddq_s16(b[30], b[25]); - a[31] = vaddq_s16(b[31], b[24]); - - // Stage 4. 
- b[0] = vaddq_s16(a[0], a[3]); - b[1] = vaddq_s16(a[1], a[2]); - b[2] = vsubq_s16(a[1], a[2]); - b[3] = vsubq_s16(a[0], a[3]); - - b[4] = a[4]; - - butterfly_one_coeff(a[6], a[5], cospi_16_64, &b[6], &b[5]); - - b[7] = a[7]; - - b[8] = vaddq_s16(a[8], a[11]); - b[9] = vaddq_s16(a[9], a[10]); - b[10] = vsubq_s16(a[9], a[10]); - b[11] = vsubq_s16(a[8], a[11]); - b[12] = vsubq_s16(a[15], a[12]); - b[13] = vsubq_s16(a[14], a[13]); - b[14] = vaddq_s16(a[14], a[13]); - b[15] = vaddq_s16(a[15], a[12]); - - b[16] = a[16]; - b[17] = a[17]; - - butterfly_two_coeff(a[29], a[18], cospi_24_64, cospi_8_64, &b[29], &b[18]); - butterfly_two_coeff(a[28], a[19], cospi_24_64, cospi_8_64, &b[28], &b[19]); - butterfly_two_coeff(a[27], a[20], -cospi_8_64, cospi_24_64, &b[27], &b[20]); - butterfly_two_coeff(a[26], a[21], -cospi_8_64, cospi_24_64, &b[26], &b[21]); - - b[22] = a[22]; - b[23] = a[23]; - b[24] = a[24]; - b[25] = a[25]; - - b[30] = a[30]; - b[31] = a[31]; - - // Stage 5. - butterfly_one_coeff(b[0], b[1], cospi_16_64, &a[0], &a[1]); - butterfly_two_coeff(b[3], b[2], cospi_24_64, cospi_8_64, &a[2], &a[3]); - - a[4] = vaddq_s16(b[4], b[5]); - a[5] = vsubq_s16(b[4], b[5]); - a[6] = vsubq_s16(b[7], b[6]); - a[7] = vaddq_s16(b[7], b[6]); - - a[8] = b[8]; - - butterfly_two_coeff(b[14], b[9], cospi_24_64, cospi_8_64, &a[14], &a[9]); - butterfly_two_coeff(b[13], b[10], -cospi_8_64, cospi_24_64, &a[13], &a[10]); - - a[11] = b[11]; - a[12] = b[12]; - - a[15] = b[15]; - - a[16] = vaddq_s16(b[19], b[16]); - a[17] = vaddq_s16(b[18], b[17]); - a[18] = vsubq_s16(b[17], b[18]); - a[19] = vsubq_s16(b[16], b[19]); - a[20] = vsubq_s16(b[23], b[20]); - a[21] = vsubq_s16(b[22], b[21]); - a[22] = vaddq_s16(b[21], b[22]); - a[23] = vaddq_s16(b[20], b[23]); - a[24] = vaddq_s16(b[27], b[24]); - a[25] = vaddq_s16(b[26], b[25]); - a[26] = vsubq_s16(b[25], b[26]); - a[27] = vsubq_s16(b[24], b[27]); - a[28] = vsubq_s16(b[31], b[28]); - a[29] = vsubq_s16(b[30], b[29]); - a[30] = vaddq_s16(b[29], b[30]); - a[31] = vaddq_s16(b[28], b[31]); - - // Stage 6. - b[0] = a[0]; - b[1] = a[1]; - b[2] = a[2]; - b[3] = a[3]; - - butterfly_two_coeff(a[7], a[4], cospi_28_64, cospi_4_64, &b[4], &b[7]); - butterfly_two_coeff(a[6], a[5], cospi_12_64, cospi_20_64, &b[5], &b[6]); - - b[8] = vaddq_s16(a[8], a[9]); - b[9] = vsubq_s16(a[8], a[9]); - b[10] = vsubq_s16(a[11], a[10]); - b[11] = vaddq_s16(a[11], a[10]); - b[12] = vaddq_s16(a[12], a[13]); - b[13] = vsubq_s16(a[12], a[13]); - b[14] = vsubq_s16(a[15], a[14]); - b[15] = vaddq_s16(a[15], a[14]); - - b[16] = a[16]; - b[19] = a[19]; - b[20] = a[20]; - b[23] = a[23]; - b[24] = a[24]; - b[27] = a[27]; - b[28] = a[28]; - b[31] = a[31]; - - butterfly_two_coeff(a[30], a[17], cospi_28_64, cospi_4_64, &b[30], &b[17]); - butterfly_two_coeff(a[29], a[18], -cospi_4_64, cospi_28_64, &b[29], &b[18]); - - butterfly_two_coeff(a[26], a[21], cospi_12_64, cospi_20_64, &b[26], &b[21]); - butterfly_two_coeff(a[25], a[22], -cospi_20_64, cospi_12_64, &b[25], &b[22]); - - // Stage 7. 
- a[0] = b[0]; - a[1] = b[1]; - a[2] = b[2]; - a[3] = b[3]; - a[4] = b[4]; - a[5] = b[5]; - a[6] = b[6]; - a[7] = b[7]; - - butterfly_two_coeff(b[15], b[8], cospi_30_64, cospi_2_64, &a[8], &a[15]); - butterfly_two_coeff(b[14], b[9], cospi_14_64, cospi_18_64, &a[9], &a[14]); - butterfly_two_coeff(b[13], b[10], cospi_22_64, cospi_10_64, &a[10], &a[13]); - butterfly_two_coeff(b[12], b[11], cospi_6_64, cospi_26_64, &a[11], &a[12]); - - a[16] = vaddq_s16(b[16], b[17]); - a[17] = vsubq_s16(b[16], b[17]); - a[18] = vsubq_s16(b[19], b[18]); - a[19] = vaddq_s16(b[19], b[18]); - a[20] = vaddq_s16(b[20], b[21]); - a[21] = vsubq_s16(b[20], b[21]); - a[22] = vsubq_s16(b[23], b[22]); - a[23] = vaddq_s16(b[23], b[22]); - a[24] = vaddq_s16(b[24], b[25]); - a[25] = vsubq_s16(b[24], b[25]); - a[26] = vsubq_s16(b[27], b[26]); - a[27] = vaddq_s16(b[27], b[26]); - a[28] = vaddq_s16(b[28], b[29]); - a[29] = vsubq_s16(b[28], b[29]); - a[30] = vsubq_s16(b[31], b[30]); - a[31] = vaddq_s16(b[31], b[30]); - - // Final stage. - out[0] = a[0]; - out[16] = a[1]; - out[8] = a[2]; - out[24] = a[3]; - out[4] = a[4]; - out[20] = a[5]; - out[12] = a[6]; - out[28] = a[7]; - out[2] = a[8]; - out[18] = a[9]; - out[10] = a[10]; - out[26] = a[11]; - out[6] = a[12]; - out[22] = a[13]; - out[14] = a[14]; - out[30] = a[15]; - - butterfly_two_coeff(a[31], a[16], cospi_31_64, cospi_1_64, &out[1], &out[31]); - butterfly_two_coeff(a[30], a[17], cospi_15_64, cospi_17_64, &out[17], - &out[15]); - butterfly_two_coeff(a[29], a[18], cospi_23_64, cospi_9_64, &out[9], &out[23]); - butterfly_two_coeff(a[28], a[19], cospi_7_64, cospi_25_64, &out[25], &out[7]); - butterfly_two_coeff(a[27], a[20], cospi_27_64, cospi_5_64, &out[5], &out[27]); - butterfly_two_coeff(a[26], a[21], cospi_11_64, cospi_21_64, &out[21], - &out[11]); - butterfly_two_coeff(a[25], a[22], cospi_19_64, cospi_13_64, &out[13], - &out[19]); - butterfly_two_coeff(a[24], a[23], cospi_3_64, cospi_29_64, &out[29], &out[3]); -} - -#undef PASS_THROUGH -#undef ADD_S16_S32 -#undef SUB_S16_S32 -#undef ADDW_S16_S32 -#undef SUBW_S16_S32 -#undef ADD_S32 -#undef SUB_S32 -#undef BUTTERFLY_ONE_S16_S32 -#undef BUTTERFLY_ONE_S32 -#undef BUTTERFLY_TWO_S32 - -// Transpose 8x8 to a new location. Don't use transpose_neon.h because those -// are all in-place. -// TODO(johannkoenig): share with other fdcts. -static INLINE void transpose_8x8(const int16x8_t *a, int16x8_t *b) { - // Swap 16 bit elements. - const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); - const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); - const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); - const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); - - // Swap 32 bit elements. 
- const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), - vreinterpretq_s32_s16(c1.val[0])); - const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), - vreinterpretq_s32_s16(c1.val[1])); - const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), - vreinterpretq_s32_s16(c3.val[0])); - const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), - vreinterpretq_s32_s16(c3.val[1])); - - // Swap 64 bit elements - const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); - const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); - const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); - const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); - - b[0] = e0.val[0]; - b[1] = e1.val[0]; - b[2] = e2.val[0]; - b[3] = e3.val[0]; - b[4] = e0.val[1]; - b[5] = e1.val[1]; - b[6] = e2.val[1]; - b[7] = e3.val[1]; -} - void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { int16x8_t temp0[32]; int16x8_t temp1[32]; @@ -1324,23 +43,27 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { int16x8_t temp5[32]; // Process in 8x32 columns. - load(input, stride, temp0); - dct_body_first_pass(temp0, temp1); + load_cross(input, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp1); - load(input + 8, stride, temp0); - dct_body_first_pass(temp0, temp2); + load_cross(input + 8, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp2); - load(input + 16, stride, temp0); - dct_body_first_pass(temp0, temp3); + load_cross(input + 16, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp3); - load(input + 24, stride, temp0); - dct_body_first_pass(temp0, temp4); + load_cross(input + 24, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp4); // Generate the top row by munging the first set of 8 from each one together. - transpose_8x8(&temp1[0], &temp0[0]); - transpose_8x8(&temp2[0], &temp0[8]); - transpose_8x8(&temp3[0], &temp0[16]); - transpose_8x8(&temp4[0], &temp0[24]); + transpose_s16_8x8_new(&temp1[0], &temp0[0]); + transpose_s16_8x8_new(&temp2[0], &temp0[8]); + transpose_s16_8x8_new(&temp3[0], &temp0[16]); + transpose_s16_8x8_new(&temp4[0], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -1355,10 +78,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output, temp5); // Second row of 8x32. 
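// A rough per-band sketch of the structure at work here (pseudocode; the
// real code is fully unrolled): each 8-row band of the 32x32 output is
// produced by transposing the matching 8x8 block from each of the four
// column strips (temp1..temp4) into one contiguous 8x32 row block, then
// running the 32-point body a second time:
//   for (band = 0; band < 4; ++band) {
//     transpose temp1[8*band]..temp4[8*band] into temp0[0..31];
//     dct_body_second_pass(temp0, temp5);
//     transpose temp5 back and store(output + band * 8 * 32, temp5);
//   }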
- transpose_8x8(&temp1[8], &temp0[0]); - transpose_8x8(&temp2[8], &temp0[8]); - transpose_8x8(&temp3[8], &temp0[16]); - transpose_8x8(&temp4[8], &temp0[24]); + transpose_s16_8x8_new(&temp1[8], &temp0[0]); + transpose_s16_8x8_new(&temp2[8], &temp0[8]); + transpose_s16_8x8_new(&temp3[8], &temp0[16]); + transpose_s16_8x8_new(&temp4[8], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -1373,10 +96,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output + 8 * 32, temp5); // Third row of 8x32 - transpose_8x8(&temp1[16], &temp0[0]); - transpose_8x8(&temp2[16], &temp0[8]); - transpose_8x8(&temp3[16], &temp0[16]); - transpose_8x8(&temp4[16], &temp0[24]); + transpose_s16_8x8_new(&temp1[16], &temp0[0]); + transpose_s16_8x8_new(&temp2[16], &temp0[8]); + transpose_s16_8x8_new(&temp3[16], &temp0[16]); + transpose_s16_8x8_new(&temp4[16], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -1391,10 +114,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output + 16 * 32, temp5); // Final row of 8x32. - transpose_8x8(&temp1[24], &temp0[0]); - transpose_8x8(&temp2[24], &temp0[8]); - transpose_8x8(&temp3[24], &temp0[16]); - transpose_8x8(&temp4[24], &temp0[24]); + transpose_s16_8x8_new(&temp1[24], &temp0[0]); + transpose_s16_8x8_new(&temp2[24], &temp0[8]); + transpose_s16_8x8_new(&temp3[24], &temp0[16]); + transpose_s16_8x8_new(&temp4[24], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -1419,23 +142,27 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, int16x8_t temp5[32]; // Process in 8x32 columns. - load(input, stride, temp0); - dct_body_first_pass(temp0, temp1); + load_cross(input, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp1); - load(input + 8, stride, temp0); - dct_body_first_pass(temp0, temp2); + load_cross(input + 8, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp2); - load(input + 16, stride, temp0); - dct_body_first_pass(temp0, temp3); + load_cross(input + 16, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp3); - load(input + 24, stride, temp0); - dct_body_first_pass(temp0, temp4); + load_cross(input + 24, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp4); // Generate the top row by munging the first set of 8 from each one together. - transpose_8x8(&temp1[0], &temp0[0]); - transpose_8x8(&temp2[0], &temp0[8]); - transpose_8x8(&temp3[0], &temp0[16]); - transpose_8x8(&temp4[0], &temp0[24]); + transpose_s16_8x8_new(&temp1[0], &temp0[0]); + transpose_s16_8x8_new(&temp2[0], &temp0[8]); + transpose_s16_8x8_new(&temp3[0], &temp0[16]); + transpose_s16_8x8_new(&temp4[0], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -1450,10 +177,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output, temp5); // Second row of 8x32. 
- transpose_8x8(&temp1[8], &temp0[0]); - transpose_8x8(&temp2[8], &temp0[8]); - transpose_8x8(&temp3[8], &temp0[16]); - transpose_8x8(&temp4[8], &temp0[24]); + transpose_s16_8x8_new(&temp1[8], &temp0[0]); + transpose_s16_8x8_new(&temp2[8], &temp0[8]); + transpose_s16_8x8_new(&temp3[8], &temp0[16]); + transpose_s16_8x8_new(&temp4[8], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -1468,10 +195,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output + 8 * 32, temp5); // Third row of 8x32 - transpose_8x8(&temp1[16], &temp0[0]); - transpose_8x8(&temp2[16], &temp0[8]); - transpose_8x8(&temp3[16], &temp0[16]); - transpose_8x8(&temp4[16], &temp0[24]); + transpose_s16_8x8_new(&temp1[16], &temp0[0]); + transpose_s16_8x8_new(&temp2[16], &temp0[8]); + transpose_s16_8x8_new(&temp3[16], &temp0[16]); + transpose_s16_8x8_new(&temp4[16], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -1486,10 +213,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output + 16 * 32, temp5); // Final row of 8x32. - transpose_8x8(&temp1[24], &temp0[0]); - transpose_8x8(&temp2[24], &temp0[8]); - transpose_8x8(&temp3[24], &temp0[16]); - transpose_8x8(&temp4[24], &temp0[24]); + transpose_s16_8x8_new(&temp1[24], &temp0[0]); + transpose_s16_8x8_new(&temp2[24], &temp0[8]); + transpose_s16_8x8_new(&temp3[24], &temp0[16]); + transpose_s16_8x8_new(&temp4[24], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -1503,5 +230,190 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, &temp5[29], &temp5[30], &temp5[31]); store(output + 24 * 32, temp5); } + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct32x32_neon(const int16_t *input, tran_low_t *output, + int stride) { + int16x8_t temp0[32]; + int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32], + right3[32], right4[32]; + int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32], + left8[32], right8[32]; + int32x4_t temp1[32], temp2[32]; + + // Process in 8x32 columns. + load_cross(input, stride, temp0); + highbd_scale_input(temp0, left1, right1); + highbd_dct8x32_body_first_pass(left1, right1); + highbd_partial_sub_round_shift(left1, right1); + + load_cross(input + 8, stride, temp0); + highbd_scale_input(temp0, left2, right2); + highbd_dct8x32_body_first_pass(left2, right2); + highbd_partial_sub_round_shift(left2, right2); + + load_cross(input + 16, stride, temp0); + highbd_scale_input(temp0, left3, right3); + highbd_dct8x32_body_first_pass(left3, right3); + highbd_partial_sub_round_shift(left3, right3); + + load_cross(input + 24, stride, temp0); + highbd_scale_input(temp0, left4, right4); + highbd_dct8x32_body_first_pass(left4, right4); + highbd_partial_sub_round_shift(left4, right4); + + // Generate the top row by munging the first set of 8 from each one together. + transpose_s32_8x8_2(left1, right1, temp1, temp2); + transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left5, right5); + highbd_dct8x32_body_second_pass(left5, right5); + highbd_partial_add_round_shift(left5, right5); + + // Second row of 8x32. 
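// The high-bitdepth path keeps the same band structure but stays in 32 bits
// end to end: each 8-lane row is carried as two int32x4 halves ("left" and
// "right", presumably the low and high four lanes), and transpose_s32_8x8_2
// moves both halves together. Rounding is likewise split across the passes:
// highbd_partial_sub_round_shift after pass 1 and
// highbd_partial_add_round_shift after pass 2.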
+ transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2); + transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left6, right6); + highbd_dct8x32_body_second_pass(left6, right6); + highbd_partial_add_round_shift(left6, right6); + + // Third row of 8x32 + transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2); + transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left7, right7); + highbd_dct8x32_body_second_pass(left7, right7); + highbd_partial_add_round_shift(left7, right7); + + // Final row of 8x32. + transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2); + transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left8, right8); + highbd_dct8x32_body_second_pass(left8, right8); + highbd_partial_add_round_shift(left8, right8); + + // Final transpose + transpose_s32_8x8_2(left5, right5, left1, right1); + transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2); + transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3); + transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4); + transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8); + transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8); + transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8); + transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8); + transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16); + transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16); + transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16); + transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 16); + transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24); + transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24); + transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24); + transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24); + + store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4, + right4); +} + +void vpx_highbd_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, + int stride) { + int16x8_t temp0[32]; + int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32], + right3[32], right4[32]; + int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32], + left8[32], right8[32]; + int32x4_t temp1[32], temp2[32]; + + // Process in 8x32 columns. 
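// Note the one difference from vpx_highbd_fdct32x32_neon above: the second
// pass calls highbd_dct8x32_body_second_pass_rd and drops the separate
// highbd_partial_add_round_shift, so the "round down" behaviour is
// presumably folded into the _rd body itself.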
+ load_cross(input, stride, temp0); + highbd_scale_input(temp0, left1, right1); + highbd_dct8x32_body_first_pass(left1, right1); + highbd_partial_sub_round_shift(left1, right1); + + load_cross(input + 8, stride, temp0); + highbd_scale_input(temp0, left2, right2); + highbd_dct8x32_body_first_pass(left2, right2); + highbd_partial_sub_round_shift(left2, right2); + + load_cross(input + 16, stride, temp0); + highbd_scale_input(temp0, left3, right3); + highbd_dct8x32_body_first_pass(left3, right3); + highbd_partial_sub_round_shift(left3, right3); + + load_cross(input + 24, stride, temp0); + highbd_scale_input(temp0, left4, right4); + highbd_dct8x32_body_first_pass(left4, right4); + highbd_partial_sub_round_shift(left4, right4); + + // Generate the top row by munging the first set of 8 from each one together. + transpose_s32_8x8_2(left1, right1, temp1, temp2); + transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left5, right5); + highbd_dct8x32_body_second_pass_rd(left5, right5); + + // Second row of 8x32. + transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2); + transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left6, right6); + highbd_dct8x32_body_second_pass_rd(left6, right6); + + // Third row of 8x32 + transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2); + transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left7, right7); + highbd_dct8x32_body_second_pass_rd(left7, right7); + + // Final row of 8x32. 
+ transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2); + transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left8, right8); + highbd_dct8x32_body_second_pass_rd(left8, right8); + + // Final transpose + transpose_s32_8x8_2(left5, right5, left1, right1); + transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2); + transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3); + transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4); + transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8); + transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8); + transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8); + transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8); + transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16); + transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16); + transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16); + transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 16); + transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24); + transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24); + transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24); + transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24); + + store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4, + right4); +} + +#endif // CONFIG_VP9_HIGHBITDEPTH + #endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && // __GNUC__ == 4 && __GNUC_MINOR__ <= 9 diff --git a/libvpx/vpx_dsp/arm/fdct32x32_neon.h b/libvpx/vpx_dsp/arm/fdct32x32_neon.h new file mode 100644 index 000000000..3b9e64c6d --- /dev/null +++ b/libvpx/vpx_dsp/arm/fdct32x32_neon.h @@ -0,0 +1,2919 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_ +#define VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/fdct_neon.h" + +// Load & cross the first 8 and last 8, then the middle +static INLINE void load_cross(const int16_t *a, int stride, int16x8_t *b) { + b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride)); + b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride)); + b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride)); + b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride)); + b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride)); + b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride)); + b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride)); + b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride)); + + b[24] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride)); + b[25] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride)); + b[26] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride)); + b[27] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride)); + b[28] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride)); + b[29] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride)); + b[30] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride)); + b[31] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride)); + + b[8] = vaddq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride)); + b[9] = vaddq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride)); + b[10] = vaddq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride)); + b[11] = vaddq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride)); + b[12] = vaddq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride)); + b[13] = vaddq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride)); + b[14] = vaddq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride)); + b[15] = vaddq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride)); + + b[16] = vsubq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride)); + b[17] = vsubq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride)); + b[18] = vsubq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride)); + b[19] = vsubq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride)); + b[20] = vsubq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride)); + b[21] = vsubq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride)); + b[22] = vsubq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride)); + b[23] = vsubq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride)); +} + +#define STORE_S16(src, index, dest) \ + do { \ + store_s16q_to_tran_low(dest, src[index]); \ + dest += 8; \ + } while (0) + +// Store 32 16x8 values, assuming stride == 32. +// Slight twist: store horizontally in blocks of 8. 
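// Concretely: the callers hand store() four transposed 8x8 quarters per
// band, so row j of the 32-wide result is spread across b[j], b[j + 8],
// b[j + 16] and b[j + 24]. Emitting the blocks in that order, eight lanes
// at a time, writes each full row of 32 coefficients contiguously.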
+static INLINE void store(tran_low_t *a, const int16x8_t *b) { + STORE_S16(b, 0, a); + STORE_S16(b, 8, a); + STORE_S16(b, 16, a); + STORE_S16(b, 24, a); + STORE_S16(b, 1, a); + STORE_S16(b, 9, a); + STORE_S16(b, 17, a); + STORE_S16(b, 25, a); + STORE_S16(b, 2, a); + STORE_S16(b, 10, a); + STORE_S16(b, 18, a); + STORE_S16(b, 26, a); + STORE_S16(b, 3, a); + STORE_S16(b, 11, a); + STORE_S16(b, 19, a); + STORE_S16(b, 27, a); + STORE_S16(b, 4, a); + STORE_S16(b, 12, a); + STORE_S16(b, 20, a); + STORE_S16(b, 28, a); + STORE_S16(b, 5, a); + STORE_S16(b, 13, a); + STORE_S16(b, 21, a); + STORE_S16(b, 29, a); + STORE_S16(b, 6, a); + STORE_S16(b, 14, a); + STORE_S16(b, 22, a); + STORE_S16(b, 30, a); + STORE_S16(b, 7, a); + STORE_S16(b, 15, a); + STORE_S16(b, 23, a); + STORE_S16(b, 31, a); +} + +#undef STORE_S16 + +static INLINE void scale_input(const int16x8_t *in /*32*/, + int16x8_t *out /*32*/) { + out[0] = vshlq_n_s16(in[0], 2); + out[1] = vshlq_n_s16(in[1], 2); + out[2] = vshlq_n_s16(in[2], 2); + out[3] = vshlq_n_s16(in[3], 2); + out[4] = vshlq_n_s16(in[4], 2); + out[5] = vshlq_n_s16(in[5], 2); + out[6] = vshlq_n_s16(in[6], 2); + out[7] = vshlq_n_s16(in[7], 2); + + out[8] = vshlq_n_s16(in[8], 2); + out[9] = vshlq_n_s16(in[9], 2); + out[10] = vshlq_n_s16(in[10], 2); + out[11] = vshlq_n_s16(in[11], 2); + out[12] = vshlq_n_s16(in[12], 2); + out[13] = vshlq_n_s16(in[13], 2); + out[14] = vshlq_n_s16(in[14], 2); + out[15] = vshlq_n_s16(in[15], 2); + + out[16] = vshlq_n_s16(in[16], 2); + out[17] = vshlq_n_s16(in[17], 2); + out[18] = vshlq_n_s16(in[18], 2); + out[19] = vshlq_n_s16(in[19], 2); + out[20] = vshlq_n_s16(in[20], 2); + out[21] = vshlq_n_s16(in[21], 2); + out[22] = vshlq_n_s16(in[22], 2); + out[23] = vshlq_n_s16(in[23], 2); + + out[24] = vshlq_n_s16(in[24], 2); + out[25] = vshlq_n_s16(in[25], 2); + out[26] = vshlq_n_s16(in[26], 2); + out[27] = vshlq_n_s16(in[27], 2); + out[28] = vshlq_n_s16(in[28], 2); + out[29] = vshlq_n_s16(in[29], 2); + out[30] = vshlq_n_s16(in[30], 2); + out[31] = vshlq_n_s16(in[31], 2); +} + +static INLINE void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // Mini cross. X the first 16 values and the middle 8 of the second half. + a[0] = vaddq_s16(in[0], in[15]); + a[1] = vaddq_s16(in[1], in[14]); + a[2] = vaddq_s16(in[2], in[13]); + a[3] = vaddq_s16(in[3], in[12]); + a[4] = vaddq_s16(in[4], in[11]); + a[5] = vaddq_s16(in[5], in[10]); + a[6] = vaddq_s16(in[6], in[9]); + a[7] = vaddq_s16(in[7], in[8]); + + a[8] = vsubq_s16(in[7], in[8]); + a[9] = vsubq_s16(in[6], in[9]); + a[10] = vsubq_s16(in[5], in[10]); + a[11] = vsubq_s16(in[4], in[11]); + a[12] = vsubq_s16(in[3], in[12]); + a[13] = vsubq_s16(in[2], in[13]); + a[14] = vsubq_s16(in[1], in[14]); + a[15] = vsubq_s16(in[0], in[15]); + + a[16] = in[16]; + a[17] = in[17]; + a[18] = in[18]; + a[19] = in[19]; + + butterfly_one_coeff_s16_s32_narrow(in[27], in[20], cospi_16_64, &a[27], + &a[20]); + butterfly_one_coeff_s16_s32_narrow(in[26], in[21], cospi_16_64, &a[26], + &a[21]); + butterfly_one_coeff_s16_s32_narrow(in[25], in[22], cospi_16_64, &a[25], + &a[22]); + butterfly_one_coeff_s16_s32_narrow(in[24], in[23], cospi_16_64, &a[24], + &a[23]); + + a[28] = in[28]; + a[29] = in[29]; + a[30] = in[30]; + a[31] = in[31]; + + // Stage 3. 
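// The cospi_16_64 butterflies in these stages follow the scalar reference:
//   *add = dct_const_round_shift((a + b) * cospi_16_64);
//   *sub = dct_const_round_shift((a - b) * cospi_16_64);
// Judging by the name, the _s16_s32_narrow variant widens the products to
// 32 bits for the DCT_CONST_BITS rounding shift, then narrows back to 16.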
+ b[0] = vaddq_s16(a[0], a[7]); + b[1] = vaddq_s16(a[1], a[6]); + b[2] = vaddq_s16(a[2], a[5]); + b[3] = vaddq_s16(a[3], a[4]); + + b[4] = vsubq_s16(a[3], a[4]); + b[5] = vsubq_s16(a[2], a[5]); + b[6] = vsubq_s16(a[1], a[6]); + b[7] = vsubq_s16(a[0], a[7]); + + b[8] = a[8]; + b[9] = a[9]; + + butterfly_one_coeff_s16_s32_narrow(a[13], a[10], cospi_16_64, &b[13], &b[10]); + butterfly_one_coeff_s16_s32_narrow(a[12], a[11], cospi_16_64, &b[12], &b[11]); + + b[14] = a[14]; + b[15] = a[15]; + + b[16] = vaddq_s16(in[16], a[23]); + b[17] = vaddq_s16(in[17], a[22]); + b[18] = vaddq_s16(in[18], a[21]); + b[19] = vaddq_s16(in[19], a[20]); + + b[20] = vsubq_s16(in[19], a[20]); + b[21] = vsubq_s16(in[18], a[21]); + b[22] = vsubq_s16(in[17], a[22]); + b[23] = vsubq_s16(in[16], a[23]); + + b[24] = vsubq_s16(in[31], a[24]); + b[25] = vsubq_s16(in[30], a[25]); + b[26] = vsubq_s16(in[29], a[26]); + b[27] = vsubq_s16(in[28], a[27]); + + b[28] = vaddq_s16(in[28], a[27]); + b[29] = vaddq_s16(in[29], a[26]); + b[30] = vaddq_s16(in[30], a[25]); + b[31] = vaddq_s16(in[31], a[24]); + + // Stage 4. + a[0] = vaddq_s16(b[0], b[3]); + a[1] = vaddq_s16(b[1], b[2]); + a[2] = vsubq_s16(b[1], b[2]); + a[3] = vsubq_s16(b[0], b[3]); + + a[4] = b[4]; + + butterfly_one_coeff_s16_s32_narrow(b[6], b[5], cospi_16_64, &a[6], &a[5]); + + a[7] = b[7]; + + a[8] = vaddq_s16(b[8], b[11]); + a[9] = vaddq_s16(b[9], b[10]); + a[10] = vsubq_s16(b[9], b[10]); + a[11] = vsubq_s16(b[8], b[11]); + a[12] = vsubq_s16(b[15], b[12]); + a[13] = vsubq_s16(b[14], b[13]); + a[14] = vaddq_s16(b[14], b[13]); + a[15] = vaddq_s16(b[15], b[12]); + + a[16] = b[16]; + a[17] = b[17]; + + butterfly_two_coeff(b[29], b[18], cospi_8_64, cospi_24_64, &a[29], &a[18]); + butterfly_two_coeff(b[28], b[19], cospi_8_64, cospi_24_64, &a[28], &a[19]); + butterfly_two_coeff(b[27], b[20], cospi_24_64, -cospi_8_64, &a[27], &a[20]); + butterfly_two_coeff(b[26], b[21], cospi_24_64, -cospi_8_64, &a[26], &a[21]); + + a[22] = b[22]; + a[23] = b[23]; + a[24] = b[24]; + a[25] = b[25]; + + a[30] = b[30]; + a[31] = b[31]; + + // Stage 5. + butterfly_one_coeff_s16_fast(a[0], a[1], cospi_16_64, &b[0], &b[1]); + butterfly_two_coeff(a[3], a[2], cospi_8_64, cospi_24_64, &b[2], &b[3]); + + b[4] = vaddq_s16(a[4], a[5]); + b[5] = vsubq_s16(a[4], a[5]); + b[6] = vsubq_s16(a[7], a[6]); + b[7] = vaddq_s16(a[7], a[6]); + + b[8] = a[8]; + + butterfly_two_coeff(a[14], a[9], cospi_8_64, cospi_24_64, &b[14], &b[9]); + butterfly_two_coeff(a[13], a[10], cospi_24_64, -cospi_8_64, &b[13], &b[10]); + + b[11] = a[11]; + b[12] = a[12]; + + b[15] = a[15]; + + b[16] = vaddq_s16(a[19], a[16]); + b[17] = vaddq_s16(a[18], a[17]); + b[18] = vsubq_s16(a[17], a[18]); + b[19] = vsubq_s16(a[16], a[19]); + b[20] = vsubq_s16(a[23], a[20]); + b[21] = vsubq_s16(a[22], a[21]); + b[22] = vaddq_s16(a[21], a[22]); + b[23] = vaddq_s16(a[20], a[23]); + b[24] = vaddq_s16(a[27], a[24]); + b[25] = vaddq_s16(a[26], a[25]); + b[26] = vsubq_s16(a[25], a[26]); + b[27] = vsubq_s16(a[24], a[27]); + b[28] = vsubq_s16(a[31], a[28]); + b[29] = vsubq_s16(a[30], a[29]); + b[30] = vaddq_s16(a[29], a[30]); + b[31] = vaddq_s16(a[28], a[31]); + + // Stage 6. 
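// butterfly_two_coeff is the unequal-coefficient rotation. Mapping these
// call sites onto the helpers this patch removes suggests the refactored
// convention for butterfly_two_coeff(a, b, c0, c1, add, sub) is
//   *add = dct_const_round_shift(a * c0 + b * c1);
//   *sub = dct_const_round_shift(a * c1 - b * c0);
// with the constants now passed smaller-first (e.g. cospi_4_64 before
// cospi_28_64), the reverse of the removed helpers' argument order.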
+ a[0] = b[0]; + a[1] = b[1]; + a[2] = b[2]; + a[3] = b[3]; + + butterfly_two_coeff(b[7], b[4], cospi_4_64, cospi_28_64, &a[4], &a[7]); + butterfly_two_coeff(b[6], b[5], cospi_20_64, cospi_12_64, &a[5], &a[6]); + + a[8] = vaddq_s16(b[8], b[9]); + a[9] = vsubq_s16(b[8], b[9]); + a[10] = vsubq_s16(b[11], b[10]); + a[11] = vaddq_s16(b[11], b[10]); + a[12] = vaddq_s16(b[12], b[13]); + a[13] = vsubq_s16(b[12], b[13]); + a[14] = vsubq_s16(b[15], b[14]); + a[15] = vaddq_s16(b[15], b[14]); + + a[16] = b[16]; + a[19] = b[19]; + a[20] = b[20]; + a[23] = b[23]; + a[24] = b[24]; + a[27] = b[27]; + a[28] = b[28]; + a[31] = b[31]; + + butterfly_two_coeff(b[30], b[17], cospi_4_64, cospi_28_64, &a[30], &a[17]); + butterfly_two_coeff(b[29], b[18], cospi_28_64, -cospi_4_64, &a[29], &a[18]); + + butterfly_two_coeff(b[26], b[21], cospi_20_64, cospi_12_64, &a[26], &a[21]); + butterfly_two_coeff(b[25], b[22], cospi_12_64, -cospi_20_64, &a[25], &a[22]); + + // Stage 7. + b[0] = a[0]; + b[1] = a[1]; + b[2] = a[2]; + b[3] = a[3]; + b[4] = a[4]; + b[5] = a[5]; + b[6] = a[6]; + b[7] = a[7]; + + butterfly_two_coeff(a[15], a[8], cospi_2_64, cospi_30_64, &b[8], &b[15]); + butterfly_two_coeff(a[14], a[9], cospi_18_64, cospi_14_64, &b[9], &b[14]); + butterfly_two_coeff(a[13], a[10], cospi_10_64, cospi_22_64, &b[10], &b[13]); + butterfly_two_coeff(a[12], a[11], cospi_26_64, cospi_6_64, &b[11], &b[12]); + + b[16] = vaddq_s16(a[16], a[17]); + b[17] = vsubq_s16(a[16], a[17]); + b[18] = vsubq_s16(a[19], a[18]); + b[19] = vaddq_s16(a[19], a[18]); + b[20] = vaddq_s16(a[20], a[21]); + b[21] = vsubq_s16(a[20], a[21]); + b[22] = vsubq_s16(a[23], a[22]); + b[23] = vaddq_s16(a[23], a[22]); + b[24] = vaddq_s16(a[24], a[25]); + b[25] = vsubq_s16(a[24], a[25]); + b[26] = vsubq_s16(a[27], a[26]); + b[27] = vaddq_s16(a[27], a[26]); + b[28] = vaddq_s16(a[28], a[29]); + b[29] = vsubq_s16(a[28], a[29]); + b[30] = vsubq_s16(a[31], a[30]); + b[31] = vaddq_s16(a[31], a[30]); + + // Final stage. 
+ // Also compute partial rounding shift: + // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + out[0] = sub_round_shift_s16(b[0]); + out[16] = sub_round_shift_s16(b[1]); + out[8] = sub_round_shift_s16(b[2]); + out[24] = sub_round_shift_s16(b[3]); + out[4] = sub_round_shift_s16(b[4]); + out[20] = sub_round_shift_s16(b[5]); + out[12] = sub_round_shift_s16(b[6]); + out[28] = sub_round_shift_s16(b[7]); + out[2] = sub_round_shift_s16(b[8]); + out[18] = sub_round_shift_s16(b[9]); + out[10] = sub_round_shift_s16(b[10]); + out[26] = sub_round_shift_s16(b[11]); + out[6] = sub_round_shift_s16(b[12]); + out[22] = sub_round_shift_s16(b[13]); + out[14] = sub_round_shift_s16(b[14]); + out[30] = sub_round_shift_s16(b[15]); + + butterfly_two_coeff(b[31], b[16], cospi_1_64, cospi_31_64, &a[1], &a[31]); + out[1] = sub_round_shift_s16(a[1]); + out[31] = sub_round_shift_s16(a[31]); + + butterfly_two_coeff(b[30], b[17], cospi_17_64, cospi_15_64, &a[17], &a[15]); + out[17] = sub_round_shift_s16(a[17]); + out[15] = sub_round_shift_s16(a[15]); + + butterfly_two_coeff(b[29], b[18], cospi_9_64, cospi_23_64, &a[9], &a[23]); + out[9] = sub_round_shift_s16(a[9]); + out[23] = sub_round_shift_s16(a[23]); + + butterfly_two_coeff(b[28], b[19], cospi_25_64, cospi_7_64, &a[25], &a[7]); + out[25] = sub_round_shift_s16(a[25]); + out[7] = sub_round_shift_s16(a[7]); + + butterfly_two_coeff(b[27], b[20], cospi_5_64, cospi_27_64, &a[5], &a[27]); + out[5] = sub_round_shift_s16(a[5]); + out[27] = sub_round_shift_s16(a[27]); + + butterfly_two_coeff(b[26], b[21], cospi_21_64, cospi_11_64, &a[21], &a[11]); + out[21] = sub_round_shift_s16(a[21]); + out[11] = sub_round_shift_s16(a[11]); + + butterfly_two_coeff(b[25], b[22], cospi_13_64, cospi_19_64, &a[13], &a[19]); + out[13] = sub_round_shift_s16(a[13]); + out[19] = sub_round_shift_s16(a[19]); + + butterfly_two_coeff(b[24], b[23], cospi_29_64, cospi_3_64, &a[29], &a[3]); + out[29] = sub_round_shift_s16(a[29]); + out[3] = sub_round_shift_s16(a[3]); +} + +#define PASS_THROUGH(src, dst, element) \ + do { \ + dst##_lo[element] = src##_lo[element]; \ + dst##_hi[element] = src##_hi[element]; \ + } while (0) + +#define ADD_S16_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = \ + vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ + b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \ + vget_high_s16(a[right_index])); \ + } while (0) + +#define SUB_S16_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = \ + vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ + b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \ + vget_high_s16(a[right_index])); \ + } while (0) + +#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \ + do { \ + c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \ + c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \ + } while (0) + +#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \ + do { \ + temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \ + temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \ + c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \ + c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \ + } while (0) + +#define ADD_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \ + b##_hi[b_index] = vaddq_s32(a##_hi[left_index], 
a##_hi[right_index]); \ + } while (0) + +#define SUB_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \ + b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \ + } while (0) + +#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \ + add_index, sub_index) \ + do { \ + butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \ + &b##_lo[add_index], &b##_hi[add_index], \ + &b##_lo[sub_index], &b##_hi[sub_index]); \ + } while (0) + +#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \ + sub_index) \ + do { \ + butterfly_one_coeff_s32_fast( \ + a##_lo[left_index], a##_hi[left_index], a##_lo[right_index], \ + a##_hi[right_index], constant, &b##_lo[add_index], &b##_hi[add_index], \ + &b##_lo[sub_index], &b##_hi[sub_index]); \ + } while (0) + +#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \ + right_constant, b, add_index, sub_index) \ + do { \ + butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \ + a##_lo[right_index], a##_hi[right_index], \ + left_constant, right_constant, &b##_lo[add_index], \ + &b##_hi[add_index], &b##_lo[sub_index], \ + &b##_hi[sub_index]); \ + } while (0) + +static INLINE void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + int32x4_t c_lo[32]; + int32x4_t c_hi[32]; + int32x4_t d_lo[32]; + int32x4_t d_hi[32]; + + // Stage 1. Done as part of the load for the first pass. + a[0] = vaddq_s16(in[0], in[31]); + a[1] = vaddq_s16(in[1], in[30]); + a[2] = vaddq_s16(in[2], in[29]); + a[3] = vaddq_s16(in[3], in[28]); + a[4] = vaddq_s16(in[4], in[27]); + a[5] = vaddq_s16(in[5], in[26]); + a[6] = vaddq_s16(in[6], in[25]); + a[7] = vaddq_s16(in[7], in[24]); + a[8] = vaddq_s16(in[8], in[23]); + a[9] = vaddq_s16(in[9], in[22]); + a[10] = vaddq_s16(in[10], in[21]); + a[11] = vaddq_s16(in[11], in[20]); + a[12] = vaddq_s16(in[12], in[19]); + a[13] = vaddq_s16(in[13], in[18]); + a[14] = vaddq_s16(in[14], in[17]); + a[15] = vaddq_s16(in[15], in[16]); + a[16] = vsubq_s16(in[15], in[16]); + a[17] = vsubq_s16(in[14], in[17]); + a[18] = vsubq_s16(in[13], in[18]); + a[19] = vsubq_s16(in[12], in[19]); + a[20] = vsubq_s16(in[11], in[20]); + a[21] = vsubq_s16(in[10], in[21]); + a[22] = vsubq_s16(in[9], in[22]); + a[23] = vsubq_s16(in[8], in[23]); + a[24] = vsubq_s16(in[7], in[24]); + a[25] = vsubq_s16(in[6], in[25]); + a[26] = vsubq_s16(in[5], in[26]); + a[27] = vsubq_s16(in[4], in[27]); + a[28] = vsubq_s16(in[3], in[28]); + a[29] = vsubq_s16(in[2], in[29]); + a[30] = vsubq_s16(in[1], in[30]); + a[31] = vsubq_s16(in[0], in[31]); + + // Stage 2. 
+ b[0] = vaddq_s16(a[0], a[15]); + b[1] = vaddq_s16(a[1], a[14]); + b[2] = vaddq_s16(a[2], a[13]); + b[3] = vaddq_s16(a[3], a[12]); + b[4] = vaddq_s16(a[4], a[11]); + b[5] = vaddq_s16(a[5], a[10]); + b[6] = vaddq_s16(a[6], a[9]); + b[7] = vaddq_s16(a[7], a[8]); + + b[8] = vsubq_s16(a[7], a[8]); + b[9] = vsubq_s16(a[6], a[9]); + b[10] = vsubq_s16(a[5], a[10]); + b[11] = vsubq_s16(a[4], a[11]); + b[12] = vsubq_s16(a[3], a[12]); + b[13] = vsubq_s16(a[2], a[13]); + b[14] = vsubq_s16(a[1], a[14]); + b[15] = vsubq_s16(a[0], a[15]); + + b[16] = a[16]; + b[17] = a[17]; + b[18] = a[18]; + b[19] = a[19]; + + butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]); + butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]); + butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]); + butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]); + + b[28] = a[28]; + b[29] = a[29]; + b[30] = a[30]; + b[31] = a[31]; + + // Stage 3. With extreme values for input this calculation rolls over int16_t. + // The sources for b[0] get added multiple times and, through testing, have + // been shown to overflow starting here. + ADD_S16_S32(b, 0, 7, c, 0); + ADD_S16_S32(b, 1, 6, c, 1); + ADD_S16_S32(b, 2, 5, c, 2); + ADD_S16_S32(b, 3, 4, c, 3); + SUB_S16_S32(b, 3, 4, c, 4); + SUB_S16_S32(b, 2, 5, c, 5); + SUB_S16_S32(b, 1, 6, c, 6); + SUB_S16_S32(b, 0, 7, c, 7); + + a[8] = b[8]; + a[9] = b[9]; + + BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10); + BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11); + + a[14] = b[14]; + a[15] = b[15]; + + ADD_S16_S32(b, 16, 23, c, 16); + ADD_S16_S32(b, 17, 22, c, 17); + ADD_S16_S32(b, 18, 21, c, 18); + ADD_S16_S32(b, 19, 20, c, 19); + SUB_S16_S32(b, 19, 20, c, 20); + SUB_S16_S32(b, 18, 21, c, 21); + SUB_S16_S32(b, 17, 22, c, 22); + SUB_S16_S32(b, 16, 23, c, 23); + SUB_S16_S32(b, 31, 24, c, 24); + SUB_S16_S32(b, 30, 25, c, 25); + SUB_S16_S32(b, 29, 26, c, 26); + SUB_S16_S32(b, 28, 27, c, 27); + ADD_S16_S32(b, 28, 27, c, 28); + ADD_S16_S32(b, 29, 26, c, 29); + ADD_S16_S32(b, 30, 25, c, 30); + ADD_S16_S32(b, 31, 24, c, 31); + + // Stage 4. + ADD_S32(c, 0, 3, d, 0); + ADD_S32(c, 1, 2, d, 1); + SUB_S32(c, 1, 2, d, 2); + SUB_S32(c, 0, 3, d, 3); + + PASS_THROUGH(c, d, 4); + + BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5); + + PASS_THROUGH(c, d, 7); + + ADDW_S16_S32(c, 11, a, 8, d, 8); + ADDW_S16_S32(c, 10, a, 9, d, 9); + SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10); + SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11); + SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12); + SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13); + ADDW_S16_S32(c, 13, b, 14, d, 14); + ADDW_S16_S32(c, 12, b, 15, d, 15); + + PASS_THROUGH(c, d, 16); + PASS_THROUGH(c, d, 17); + + BUTTERFLY_TWO_S32(c, 29, 18, cospi_8_64, cospi_24_64, d, 29, 18); + BUTTERFLY_TWO_S32(c, 28, 19, cospi_8_64, cospi_24_64, d, 28, 19); + BUTTERFLY_TWO_S32(c, 27, 20, cospi_24_64, -cospi_8_64, d, 27, 20); + BUTTERFLY_TWO_S32(c, 26, 21, cospi_24_64, -cospi_8_64, d, 26, 21); + + PASS_THROUGH(c, d, 22); + PASS_THROUGH(c, d, 23); + PASS_THROUGH(c, d, 24); + PASS_THROUGH(c, d, 25); + + PASS_THROUGH(c, d, 30); + PASS_THROUGH(c, d, 31); + + // Stage 5. 
+ BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1); + BUTTERFLY_TWO_S32(d, 3, 2, cospi_8_64, cospi_24_64, c, 2, 3); + + ADD_S32(d, 4, 5, c, 4); + SUB_S32(d, 4, 5, c, 5); + SUB_S32(d, 7, 6, c, 6); + ADD_S32(d, 7, 6, c, 7); + + PASS_THROUGH(d, c, 8); + + BUTTERFLY_TWO_S32(d, 14, 9, cospi_8_64, cospi_24_64, c, 14, 9); + BUTTERFLY_TWO_S32(d, 13, 10, cospi_24_64, -cospi_8_64, c, 13, 10); + + PASS_THROUGH(d, c, 11); + PASS_THROUGH(d, c, 12); + PASS_THROUGH(d, c, 15); + + ADD_S32(d, 16, 19, c, 16); + ADD_S32(d, 17, 18, c, 17); + SUB_S32(d, 17, 18, c, 18); + SUB_S32(d, 16, 19, c, 19); + SUB_S32(d, 23, 20, c, 20); + SUB_S32(d, 22, 21, c, 21); + ADD_S32(d, 22, 21, c, 22); + ADD_S32(d, 23, 20, c, 23); + ADD_S32(d, 24, 27, c, 24); + ADD_S32(d, 25, 26, c, 25); + SUB_S32(d, 25, 26, c, 26); + SUB_S32(d, 24, 27, c, 27); + SUB_S32(d, 31, 28, c, 28); + SUB_S32(d, 30, 29, c, 29); + ADD_S32(d, 30, 29, c, 30); + ADD_S32(d, 31, 28, c, 31); + + // Stage 6. + PASS_THROUGH(c, d, 0); + PASS_THROUGH(c, d, 1); + PASS_THROUGH(c, d, 2); + PASS_THROUGH(c, d, 3); + + BUTTERFLY_TWO_S32(c, 7, 4, cospi_4_64, cospi_28_64, d, 4, 7); + BUTTERFLY_TWO_S32(c, 6, 5, cospi_20_64, cospi_12_64, d, 5, 6); + + ADD_S32(c, 8, 9, d, 8); + SUB_S32(c, 8, 9, d, 9); + SUB_S32(c, 11, 10, d, 10); + ADD_S32(c, 11, 10, d, 11); + ADD_S32(c, 12, 13, d, 12); + SUB_S32(c, 12, 13, d, 13); + SUB_S32(c, 15, 14, d, 14); + ADD_S32(c, 15, 14, d, 15); + + PASS_THROUGH(c, d, 16); + PASS_THROUGH(c, d, 19); + PASS_THROUGH(c, d, 20); + PASS_THROUGH(c, d, 23); + PASS_THROUGH(c, d, 24); + PASS_THROUGH(c, d, 27); + PASS_THROUGH(c, d, 28); + PASS_THROUGH(c, d, 31); + + BUTTERFLY_TWO_S32(c, 30, 17, cospi_4_64, cospi_28_64, d, 30, 17); + BUTTERFLY_TWO_S32(c, 29, 18, cospi_28_64, -cospi_4_64, d, 29, 18); + BUTTERFLY_TWO_S32(c, 26, 21, cospi_20_64, cospi_12_64, d, 26, 21); + BUTTERFLY_TWO_S32(c, 25, 22, cospi_12_64, -cospi_20_64, d, 25, 22); + + // Stage 7. + PASS_THROUGH(d, c, 0); + PASS_THROUGH(d, c, 1); + PASS_THROUGH(d, c, 2); + PASS_THROUGH(d, c, 3); + PASS_THROUGH(d, c, 4); + PASS_THROUGH(d, c, 5); + PASS_THROUGH(d, c, 6); + PASS_THROUGH(d, c, 7); + + BUTTERFLY_TWO_S32(d, 15, 8, cospi_2_64, cospi_30_64, c, 8, 15); + BUTTERFLY_TWO_S32(d, 14, 9, cospi_18_64, cospi_14_64, c, 9, 14); + BUTTERFLY_TWO_S32(d, 13, 10, cospi_10_64, cospi_22_64, c, 10, 13); + BUTTERFLY_TWO_S32(d, 12, 11, cospi_26_64, cospi_6_64, c, 11, 12); + + ADD_S32(d, 16, 17, c, 16); + SUB_S32(d, 16, 17, c, 17); + SUB_S32(d, 19, 18, c, 18); + ADD_S32(d, 19, 18, c, 19); + ADD_S32(d, 20, 21, c, 20); + SUB_S32(d, 20, 21, c, 21); + SUB_S32(d, 23, 22, c, 22); + ADD_S32(d, 23, 22, c, 23); + ADD_S32(d, 24, 25, c, 24); + SUB_S32(d, 24, 25, c, 25); + SUB_S32(d, 27, 26, c, 26); + ADD_S32(d, 27, 26, c, 27); + ADD_S32(d, 28, 29, c, 28); + SUB_S32(d, 28, 29, c, 29); + SUB_S32(d, 31, 30, c, 30); + ADD_S32(d, 31, 30, c, 31); + + // Final stage. + // Roll rounding into this function so we can pass back int16x8. 
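// The scalar equivalent of the add_round_shift_s32_narrow calls below, per
// the comments on the helpers this patch removes ("add 1, then add the
// sign bit, then shift without rounding"):
//   out = (int16_t)((x + 1 + (x < 0 ? 1 : 0)) >> 2);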
+ + out[0] = add_round_shift_s32_narrow(c_lo[0], c_hi[0]); + out[16] = add_round_shift_s32_narrow(c_lo[1], c_hi[1]); + + out[8] = add_round_shift_s32_narrow(c_lo[2], c_hi[2]); + out[24] = add_round_shift_s32_narrow(c_lo[3], c_hi[3]); + out[4] = add_round_shift_s32_narrow(c_lo[4], c_hi[4]); + out[20] = add_round_shift_s32_narrow(c_lo[5], c_hi[5]); + out[12] = add_round_shift_s32_narrow(c_lo[6], c_hi[6]); + + out[28] = add_round_shift_s32_narrow(c_lo[7], c_hi[7]); + out[2] = add_round_shift_s32_narrow(c_lo[8], c_hi[8]); + out[18] = add_round_shift_s32_narrow(c_lo[9], c_hi[9]); + out[10] = add_round_shift_s32_narrow(c_lo[10], c_hi[10]); + + out[26] = add_round_shift_s32_narrow(c_lo[11], c_hi[11]); + out[6] = add_round_shift_s32_narrow(c_lo[12], c_hi[12]); + out[22] = add_round_shift_s32_narrow(c_lo[13], c_hi[13]); + out[14] = add_round_shift_s32_narrow(c_lo[14], c_hi[14]); + out[30] = add_round_shift_s32_narrow(c_lo[15], c_hi[15]); + + BUTTERFLY_TWO_S32(c, 31, 16, cospi_1_64, cospi_31_64, d, 1, 31); + out[1] = add_round_shift_s32_narrow(d_lo[1], d_hi[1]); + out[31] = add_round_shift_s32_narrow(d_lo[31], d_hi[31]); + + BUTTERFLY_TWO_S32(c, 30, 17, cospi_17_64, cospi_15_64, d, 17, 15); + out[17] = add_round_shift_s32_narrow(d_lo[17], d_hi[17]); + out[15] = add_round_shift_s32_narrow(d_lo[15], d_hi[15]); + + BUTTERFLY_TWO_S32(c, 29, 18, cospi_9_64, cospi_23_64, d, 9, 23); + out[9] = add_round_shift_s32_narrow(d_lo[9], d_hi[9]); + out[23] = add_round_shift_s32_narrow(d_lo[23], d_hi[23]); + + BUTTERFLY_TWO_S32(c, 28, 19, cospi_25_64, cospi_7_64, d, 25, 7); + out[25] = add_round_shift_s32_narrow(d_lo[25], d_hi[25]); + out[7] = add_round_shift_s32_narrow(d_lo[7], d_hi[7]); + + BUTTERFLY_TWO_S32(c, 27, 20, cospi_5_64, cospi_27_64, d, 5, 27); + out[5] = add_round_shift_s32_narrow(d_lo[5], d_hi[5]); + out[27] = add_round_shift_s32_narrow(d_lo[27], d_hi[27]); + + BUTTERFLY_TWO_S32(c, 26, 21, cospi_21_64, cospi_11_64, d, 21, 11); + out[21] = add_round_shift_s32_narrow(d_lo[21], d_hi[21]); + out[11] = add_round_shift_s32_narrow(d_lo[11], d_hi[11]); + + BUTTERFLY_TWO_S32(c, 25, 22, cospi_13_64, cospi_19_64, d, 13, 19); + out[13] = add_round_shift_s32_narrow(d_lo[13], d_hi[13]); + out[19] = add_round_shift_s32_narrow(d_lo[19], d_hi[19]); + + BUTTERFLY_TWO_S32(c, 24, 23, cospi_29_64, cospi_3_64, d, 29, 3); + out[29] = add_round_shift_s32_narrow(d_lo[29], d_hi[29]); + out[3] = add_round_shift_s32_narrow(d_lo[3], d_hi[3]); +} + +static INLINE void dct_body_second_pass_rd(const int16x8_t *in, + int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + + // Stage 1. Done as part of the load for the first pass. 
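// In the first pass this in[i] +/- in[31 - i] cross is folded into
// load_cross, but the second pass operates on freshly transposed
// intermediates, so the cross has to be redone explicitly here.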
+ a[0] = vaddq_s16(in[0], in[31]); + a[1] = vaddq_s16(in[1], in[30]); + a[2] = vaddq_s16(in[2], in[29]); + a[3] = vaddq_s16(in[3], in[28]); + a[4] = vaddq_s16(in[4], in[27]); + a[5] = vaddq_s16(in[5], in[26]); + a[6] = vaddq_s16(in[6], in[25]); + a[7] = vaddq_s16(in[7], in[24]); + a[8] = vaddq_s16(in[8], in[23]); + a[9] = vaddq_s16(in[9], in[22]); + a[10] = vaddq_s16(in[10], in[21]); + a[11] = vaddq_s16(in[11], in[20]); + a[12] = vaddq_s16(in[12], in[19]); + a[13] = vaddq_s16(in[13], in[18]); + a[14] = vaddq_s16(in[14], in[17]); + a[15] = vaddq_s16(in[15], in[16]); + a[16] = vsubq_s16(in[15], in[16]); + a[17] = vsubq_s16(in[14], in[17]); + a[18] = vsubq_s16(in[13], in[18]); + a[19] = vsubq_s16(in[12], in[19]); + a[20] = vsubq_s16(in[11], in[20]); + a[21] = vsubq_s16(in[10], in[21]); + a[22] = vsubq_s16(in[9], in[22]); + a[23] = vsubq_s16(in[8], in[23]); + a[24] = vsubq_s16(in[7], in[24]); + a[25] = vsubq_s16(in[6], in[25]); + a[26] = vsubq_s16(in[5], in[26]); + a[27] = vsubq_s16(in[4], in[27]); + a[28] = vsubq_s16(in[3], in[28]); + a[29] = vsubq_s16(in[2], in[29]); + a[30] = vsubq_s16(in[1], in[30]); + a[31] = vsubq_s16(in[0], in[31]); + + // Stage 2. + // For the "rd" version, all the values are rounded down after stage 2 to keep + // the values in 16 bits. + b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15])); + b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14])); + b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13])); + b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12])); + b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11])); + b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10])); + b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9])); + b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8])); + + b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8])); + b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9])); + b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10])); + b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11])); + b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12])); + b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13])); + b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14])); + b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15])); + + b[16] = add_round_shift_s16(a[16]); + b[17] = add_round_shift_s16(a[17]); + b[18] = add_round_shift_s16(a[18]); + b[19] = add_round_shift_s16(a[19]); + + butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]); + butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]); + butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]); + butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]); + b[20] = add_round_shift_s16(b[20]); + b[21] = add_round_shift_s16(b[21]); + b[22] = add_round_shift_s16(b[22]); + b[23] = add_round_shift_s16(b[23]); + b[24] = add_round_shift_s16(b[24]); + b[25] = add_round_shift_s16(b[25]); + b[26] = add_round_shift_s16(b[26]); + b[27] = add_round_shift_s16(b[27]); + + b[28] = add_round_shift_s16(a[28]); + b[29] = add_round_shift_s16(a[29]); + b[30] = add_round_shift_s16(a[30]); + b[31] = add_round_shift_s16(a[31]); + + // Stage 3. 
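Stage 2 above and Stage 3 below rotate pairs through butterfly_one_coeff_s16_s32_narrow. From its call sites it is assumed to compute add = ROUND_POWER_OF_TWO((a + b) * c, 14) and sub = ROUND_POWER_OF_TWO((a - b) * c, 14), widening to 32 bits so the products cannot overflow and narrowing back to 16 (DCT_CONST_BITS is 14 in libvpx). A sketch under that assumption, not the patch's verbatim helper:

#include <arm_neon.h>

static inline void butterfly_one_coeff_s16_s32_narrow_sketch(
    const int16x8_t a, const int16x8_t b, const int16_t c, int16x8_t *add,
    int16x8_t *sub) {
  /* a * c in 32 bits, then accumulate +/- b * c before rounding/narrowing. */
  const int32x4_t a_lo = vmull_n_s16(vget_low_s16(a), c);
  const int32x4_t a_hi = vmull_n_s16(vget_high_s16(a), c);
  const int32x4_t sum_lo = vmlal_n_s16(a_lo, vget_low_s16(b), c);
  const int32x4_t sum_hi = vmlal_n_s16(a_hi, vget_high_s16(b), c);
  const int32x4_t diff_lo = vmlsl_n_s16(a_lo, vget_low_s16(b), c);
  const int32x4_t diff_hi = vmlsl_n_s16(a_hi, vget_high_s16(b), c);
  *add = vcombine_s16(vrshrn_n_s32(sum_lo, 14), vrshrn_n_s32(sum_hi, 14));
  *sub = vcombine_s16(vrshrn_n_s32(diff_lo, 14), vrshrn_n_s32(diff_hi, 14));
}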
+ a[0] = vaddq_s16(b[0], b[7]); + a[1] = vaddq_s16(b[1], b[6]); + a[2] = vaddq_s16(b[2], b[5]); + a[3] = vaddq_s16(b[3], b[4]); + + a[4] = vsubq_s16(b[3], b[4]); + a[5] = vsubq_s16(b[2], b[5]); + a[6] = vsubq_s16(b[1], b[6]); + a[7] = vsubq_s16(b[0], b[7]); + + a[8] = b[8]; + a[9] = b[9]; + + butterfly_one_coeff_s16_s32_narrow(b[13], b[10], cospi_16_64, &a[13], &a[10]); + butterfly_one_coeff_s16_s32_narrow(b[12], b[11], cospi_16_64, &a[12], &a[11]); + + a[14] = b[14]; + a[15] = b[15]; + + a[16] = vaddq_s16(b[16], b[23]); + a[17] = vaddq_s16(b[17], b[22]); + a[18] = vaddq_s16(b[18], b[21]); + a[19] = vaddq_s16(b[19], b[20]); + + a[20] = vsubq_s16(b[19], b[20]); + a[21] = vsubq_s16(b[18], b[21]); + a[22] = vsubq_s16(b[17], b[22]); + a[23] = vsubq_s16(b[16], b[23]); + + a[24] = vsubq_s16(b[31], b[24]); + a[25] = vsubq_s16(b[30], b[25]); + a[26] = vsubq_s16(b[29], b[26]); + a[27] = vsubq_s16(b[28], b[27]); + + a[28] = vaddq_s16(b[28], b[27]); + a[29] = vaddq_s16(b[29], b[26]); + a[30] = vaddq_s16(b[30], b[25]); + a[31] = vaddq_s16(b[31], b[24]); + + // Stage 4. + b[0] = vaddq_s16(a[0], a[3]); + b[1] = vaddq_s16(a[1], a[2]); + b[2] = vsubq_s16(a[1], a[2]); + b[3] = vsubq_s16(a[0], a[3]); + + b[4] = a[4]; + + butterfly_one_coeff_s16_s32_narrow(a[6], a[5], cospi_16_64, &b[6], &b[5]); + + b[7] = a[7]; + + b[8] = vaddq_s16(a[8], a[11]); + b[9] = vaddq_s16(a[9], a[10]); + b[10] = vsubq_s16(a[9], a[10]); + b[11] = vsubq_s16(a[8], a[11]); + b[12] = vsubq_s16(a[15], a[12]); + b[13] = vsubq_s16(a[14], a[13]); + b[14] = vaddq_s16(a[14], a[13]); + b[15] = vaddq_s16(a[15], a[12]); + + b[16] = a[16]; + b[17] = a[17]; + + butterfly_two_coeff(a[29], a[18], cospi_8_64, cospi_24_64, &b[29], &b[18]); + butterfly_two_coeff(a[28], a[19], cospi_8_64, cospi_24_64, &b[28], &b[19]); + butterfly_two_coeff(a[27], a[20], cospi_24_64, -cospi_8_64, &b[27], &b[20]); + butterfly_two_coeff(a[26], a[21], cospi_24_64, -cospi_8_64, &b[26], &b[21]); + + b[22] = a[22]; + b[23] = a[23]; + b[24] = a[24]; + b[25] = a[25]; + + b[30] = a[30]; + b[31] = a[31]; + + // Stage 5. + butterfly_one_coeff_s16_s32_narrow(b[0], b[1], cospi_16_64, &a[0], &a[1]); + butterfly_two_coeff(b[3], b[2], cospi_8_64, cospi_24_64, &a[2], &a[3]); + + a[4] = vaddq_s16(b[4], b[5]); + a[5] = vsubq_s16(b[4], b[5]); + a[6] = vsubq_s16(b[7], b[6]); + a[7] = vaddq_s16(b[7], b[6]); + + a[8] = b[8]; + + butterfly_two_coeff(b[14], b[9], cospi_8_64, cospi_24_64, &a[14], &a[9]); + butterfly_two_coeff(b[13], b[10], cospi_24_64, -cospi_8_64, &a[13], &a[10]); + + a[11] = b[11]; + a[12] = b[12]; + + a[15] = b[15]; + + a[16] = vaddq_s16(b[19], b[16]); + a[17] = vaddq_s16(b[18], b[17]); + a[18] = vsubq_s16(b[17], b[18]); + a[19] = vsubq_s16(b[16], b[19]); + a[20] = vsubq_s16(b[23], b[20]); + a[21] = vsubq_s16(b[22], b[21]); + a[22] = vaddq_s16(b[21], b[22]); + a[23] = vaddq_s16(b[20], b[23]); + a[24] = vaddq_s16(b[27], b[24]); + a[25] = vaddq_s16(b[26], b[25]); + a[26] = vsubq_s16(b[25], b[26]); + a[27] = vsubq_s16(b[24], b[27]); + a[28] = vsubq_s16(b[31], b[28]); + a[29] = vsubq_s16(b[30], b[29]); + a[30] = vaddq_s16(b[29], b[30]); + a[31] = vaddq_s16(b[28], b[31]); + + // Stage 6. 
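Stages 4 through 7, and the final stage, use the two-coefficient form. Matching the call sites against the scalar vpx_fdct32, butterfly_two_coeff(a, b, c1, c2, &add, &sub) is assumed to produce add = ROUND_POWER_OF_TWO(a * c1 + b * c2, 14) and sub = ROUND_POWER_OF_TWO(a * c2 - b * c1, 14). A sketch under that assumption:

#include <arm_neon.h>

static inline void butterfly_two_coeff_sketch(const int16x8_t a,
                                              const int16x8_t b,
                                              const int16_t c1,
                                              const int16_t c2,
                                              int16x8_t *add, int16x8_t *sub) {
  /* a * c1 + b * c2 and a * c2 - b * c1, computed in 32 bits. */
  const int32x4_t sum_lo =
      vmlal_n_s16(vmull_n_s16(vget_low_s16(a), c1), vget_low_s16(b), c2);
  const int32x4_t sum_hi =
      vmlal_n_s16(vmull_n_s16(vget_high_s16(a), c1), vget_high_s16(b), c2);
  const int32x4_t diff_lo =
      vmlsl_n_s16(vmull_n_s16(vget_low_s16(a), c2), vget_low_s16(b), c1);
  const int32x4_t diff_hi =
      vmlsl_n_s16(vmull_n_s16(vget_high_s16(a), c2), vget_high_s16(b), c1);
  *add = vcombine_s16(vrshrn_n_s32(sum_lo, 14), vrshrn_n_s32(sum_hi, 14));
  *sub = vcombine_s16(vrshrn_n_s32(diff_lo, 14), vrshrn_n_s32(diff_hi, 14));
}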
+ b[0] = a[0]; + b[1] = a[1]; + b[2] = a[2]; + b[3] = a[3]; + + butterfly_two_coeff(a[7], a[4], cospi_4_64, cospi_28_64, &b[4], &b[7]); + butterfly_two_coeff(a[6], a[5], cospi_20_64, cospi_12_64, &b[5], &b[6]); + + b[8] = vaddq_s16(a[8], a[9]); + b[9] = vsubq_s16(a[8], a[9]); + b[10] = vsubq_s16(a[11], a[10]); + b[11] = vaddq_s16(a[11], a[10]); + b[12] = vaddq_s16(a[12], a[13]); + b[13] = vsubq_s16(a[12], a[13]); + b[14] = vsubq_s16(a[15], a[14]); + b[15] = vaddq_s16(a[15], a[14]); + + b[16] = a[16]; + b[19] = a[19]; + b[20] = a[20]; + b[23] = a[23]; + b[24] = a[24]; + b[27] = a[27]; + b[28] = a[28]; + b[31] = a[31]; + + butterfly_two_coeff(a[30], a[17], cospi_4_64, cospi_28_64, &b[30], &b[17]); + butterfly_two_coeff(a[29], a[18], cospi_28_64, -cospi_4_64, &b[29], &b[18]); + + butterfly_two_coeff(a[26], a[21], cospi_20_64, cospi_12_64, &b[26], &b[21]); + butterfly_two_coeff(a[25], a[22], cospi_12_64, -cospi_20_64, &b[25], &b[22]); + + // Stage 7. + a[0] = b[0]; + a[1] = b[1]; + a[2] = b[2]; + a[3] = b[3]; + a[4] = b[4]; + a[5] = b[5]; + a[6] = b[6]; + a[7] = b[7]; + + butterfly_two_coeff(b[15], b[8], cospi_2_64, cospi_30_64, &a[8], &a[15]); + butterfly_two_coeff(b[14], b[9], cospi_18_64, cospi_14_64, &a[9], &a[14]); + butterfly_two_coeff(b[13], b[10], cospi_10_64, cospi_22_64, &a[10], &a[13]); + butterfly_two_coeff(b[12], b[11], cospi_26_64, cospi_6_64, &a[11], &a[12]); + + a[16] = vaddq_s16(b[16], b[17]); + a[17] = vsubq_s16(b[16], b[17]); + a[18] = vsubq_s16(b[19], b[18]); + a[19] = vaddq_s16(b[19], b[18]); + a[20] = vaddq_s16(b[20], b[21]); + a[21] = vsubq_s16(b[20], b[21]); + a[22] = vsubq_s16(b[23], b[22]); + a[23] = vaddq_s16(b[23], b[22]); + a[24] = vaddq_s16(b[24], b[25]); + a[25] = vsubq_s16(b[24], b[25]); + a[26] = vsubq_s16(b[27], b[26]); + a[27] = vaddq_s16(b[27], b[26]); + a[28] = vaddq_s16(b[28], b[29]); + a[29] = vsubq_s16(b[28], b[29]); + a[30] = vsubq_s16(b[31], b[30]); + a[31] = vaddq_s16(b[31], b[30]); + + // Final stage. + out[0] = a[0]; + out[16] = a[1]; + out[8] = a[2]; + out[24] = a[3]; + out[4] = a[4]; + out[20] = a[5]; + out[12] = a[6]; + out[28] = a[7]; + out[2] = a[8]; + out[18] = a[9]; + out[10] = a[10]; + out[26] = a[11]; + out[6] = a[12]; + out[22] = a[13]; + out[14] = a[14]; + out[30] = a[15]; + + butterfly_two_coeff(a[31], a[16], cospi_1_64, cospi_31_64, &out[1], &out[31]); + butterfly_two_coeff(a[30], a[17], cospi_17_64, cospi_15_64, &out[17], + &out[15]); + butterfly_two_coeff(a[29], a[18], cospi_9_64, cospi_23_64, &out[9], &out[23]); + butterfly_two_coeff(a[28], a[19], cospi_25_64, cospi_7_64, &out[25], &out[7]); + butterfly_two_coeff(a[27], a[20], cospi_5_64, cospi_27_64, &out[5], &out[27]); + butterfly_two_coeff(a[26], a[21], cospi_21_64, cospi_11_64, &out[21], + &out[11]); + butterfly_two_coeff(a[25], a[22], cospi_13_64, cospi_19_64, &out[13], + &out[19]); + butterfly_two_coeff(a[24], a[23], cospi_29_64, cospi_3_64, &out[29], &out[3]); +} + +#undef PASS_THROUGH +#undef ADD_S16_S32 +#undef SUB_S16_S32 +#undef ADDW_S16_S32 +#undef SUBW_S16_S32 +#undef ADD_S32 +#undef SUB_S32 +#undef BUTTERFLY_ONE_S16_S32 +#undef BUTTERFLY_ONE_S32 +#undef BUTTERFLY_TWO_S32 + +#if CONFIG_VP9_HIGHBITDEPTH + +// Store 32 32x4 vectors, assuming stride == 32. 
+static INLINE void store32x32_s32(
+    tran_low_t *a, const int32x4_t *l1 /*[32]*/, const int32x4_t *r1 /*[32]*/,
+    const int32x4_t *l2 /*[32]*/, const int32x4_t *r2 /*[32]*/,
+    const int32x4_t *l3 /*[32]*/, const int32x4_t *r3 /*[32]*/,
+    const int32x4_t *l4 /*[32]*/, const int32x4_t *r4 /*[32]*/) {
+  int i;
+  for (i = 0; i < 32; i++) {
+    vst1q_s32(a, l1[i]);
+    vst1q_s32(a + 4, r1[i]);
+    vst1q_s32(a + 8, l2[i]);
+    vst1q_s32(a + 12, r2[i]);
+    vst1q_s32(a + 16, l3[i]);
+    vst1q_s32(a + 20, r3[i]);
+    vst1q_s32(a + 24, l4[i]);
+    vst1q_s32(a + 28, r4[i]);
+    a += 32;
+  }
+}
+
+static INLINE void highbd_scale_input(const int16x8_t *a /*[32]*/,
+                                      int32x4_t *left /*[32]*/,
+                                      int32x4_t *right /* [32] */) {
+  left[0] = vshll_n_s16(vget_low_s16(a[0]), 2);
+  left[1] = vshll_n_s16(vget_low_s16(a[1]), 2);
+  left[2] = vshll_n_s16(vget_low_s16(a[2]), 2);
+  left[3] = vshll_n_s16(vget_low_s16(a[3]), 2);
+  left[4] = vshll_n_s16(vget_low_s16(a[4]), 2);
+  left[5] = vshll_n_s16(vget_low_s16(a[5]), 2);
+  left[6] = vshll_n_s16(vget_low_s16(a[6]), 2);
+  left[7] = vshll_n_s16(vget_low_s16(a[7]), 2);
+  left[8] = vshll_n_s16(vget_low_s16(a[8]), 2);
+  left[9] = vshll_n_s16(vget_low_s16(a[9]), 2);
+  left[10] = vshll_n_s16(vget_low_s16(a[10]), 2);
+  left[11] = vshll_n_s16(vget_low_s16(a[11]), 2);
+  left[12] = vshll_n_s16(vget_low_s16(a[12]), 2);
+  left[13] = vshll_n_s16(vget_low_s16(a[13]), 2);
+  left[14] = vshll_n_s16(vget_low_s16(a[14]), 2);
+  left[15] = vshll_n_s16(vget_low_s16(a[15]), 2);
+  left[16] = vshll_n_s16(vget_low_s16(a[16]), 2);
+  left[17] = vshll_n_s16(vget_low_s16(a[17]), 2);
+  left[18] = vshll_n_s16(vget_low_s16(a[18]), 2);
+  left[19] = vshll_n_s16(vget_low_s16(a[19]), 2);
+  left[20] = vshll_n_s16(vget_low_s16(a[20]), 2);
+  left[21] = vshll_n_s16(vget_low_s16(a[21]), 2);
+  left[22] = vshll_n_s16(vget_low_s16(a[22]), 2);
+  left[23] = vshll_n_s16(vget_low_s16(a[23]), 2);
+  left[24] = vshll_n_s16(vget_low_s16(a[24]), 2);
+  left[25] = vshll_n_s16(vget_low_s16(a[25]), 2);
+  left[26] = vshll_n_s16(vget_low_s16(a[26]), 2);
+  left[27] = vshll_n_s16(vget_low_s16(a[27]), 2);
+  left[28] = vshll_n_s16(vget_low_s16(a[28]), 2);
+  left[29] = vshll_n_s16(vget_low_s16(a[29]), 2);
+  left[30] = vshll_n_s16(vget_low_s16(a[30]), 2);
+  left[31] = vshll_n_s16(vget_low_s16(a[31]), 2);
+
+  right[0] = vshll_n_s16(vget_high_s16(a[0]), 2);
+  right[1] = vshll_n_s16(vget_high_s16(a[1]), 2);
+  right[2] = vshll_n_s16(vget_high_s16(a[2]), 2);
+  right[3] = vshll_n_s16(vget_high_s16(a[3]), 2);
+  right[4] = vshll_n_s16(vget_high_s16(a[4]), 2);
+  right[5] = vshll_n_s16(vget_high_s16(a[5]), 2);
+  right[6] = vshll_n_s16(vget_high_s16(a[6]), 2);
+  right[7] = vshll_n_s16(vget_high_s16(a[7]), 2);
+  right[8] = vshll_n_s16(vget_high_s16(a[8]), 2);
+  right[9] = vshll_n_s16(vget_high_s16(a[9]), 2);
+  right[10] = vshll_n_s16(vget_high_s16(a[10]), 2);
+  right[11] = vshll_n_s16(vget_high_s16(a[11]), 2);
+  right[12] = vshll_n_s16(vget_high_s16(a[12]), 2);
+  right[13] = vshll_n_s16(vget_high_s16(a[13]), 2);
+  right[14] = vshll_n_s16(vget_high_s16(a[14]), 2);
+  right[15] = vshll_n_s16(vget_high_s16(a[15]), 2);
+  right[16] = vshll_n_s16(vget_high_s16(a[16]), 2);
+  right[17] = vshll_n_s16(vget_high_s16(a[17]), 2);
+  right[18] = vshll_n_s16(vget_high_s16(a[18]), 2);
+  right[19] = vshll_n_s16(vget_high_s16(a[19]), 2);
+  right[20] = vshll_n_s16(vget_high_s16(a[20]), 2);
+  right[21] = vshll_n_s16(vget_high_s16(a[21]), 2);
+  right[22] = vshll_n_s16(vget_high_s16(a[22]), 2);
+  right[23] = vshll_n_s16(vget_high_s16(a[23]), 2);
+  right[24] =
vshll_n_s16(vget_high_s16(a[24]), 2); + right[25] = vshll_n_s16(vget_high_s16(a[25]), 2); + right[26] = vshll_n_s16(vget_high_s16(a[26]), 2); + right[27] = vshll_n_s16(vget_high_s16(a[27]), 2); + right[28] = vshll_n_s16(vget_high_s16(a[28]), 2); + right[29] = vshll_n_s16(vget_high_s16(a[29]), 2); + right[30] = vshll_n_s16(vget_high_s16(a[30]), 2); + right[31] = vshll_n_s16(vget_high_s16(a[31]), 2); +} + +static INLINE void highbd_cross_input(const int32x4_t *a_left /*[32]*/, + int32x4_t *a_right /*[32]*/, + int32x4_t *b_left /*[32]*/, + int32x4_t *b_right /*[32]*/) { + // Stage 1. Done as part of the load for the first pass. + b_left[0] = vaddq_s32(a_left[0], a_left[31]); + b_left[1] = vaddq_s32(a_left[1], a_left[30]); + b_left[2] = vaddq_s32(a_left[2], a_left[29]); + b_left[3] = vaddq_s32(a_left[3], a_left[28]); + b_left[4] = vaddq_s32(a_left[4], a_left[27]); + b_left[5] = vaddq_s32(a_left[5], a_left[26]); + b_left[6] = vaddq_s32(a_left[6], a_left[25]); + b_left[7] = vaddq_s32(a_left[7], a_left[24]); + b_left[8] = vaddq_s32(a_left[8], a_left[23]); + b_left[9] = vaddq_s32(a_left[9], a_left[22]); + b_left[10] = vaddq_s32(a_left[10], a_left[21]); + b_left[11] = vaddq_s32(a_left[11], a_left[20]); + b_left[12] = vaddq_s32(a_left[12], a_left[19]); + b_left[13] = vaddq_s32(a_left[13], a_left[18]); + b_left[14] = vaddq_s32(a_left[14], a_left[17]); + b_left[15] = vaddq_s32(a_left[15], a_left[16]); + + b_right[0] = vaddq_s32(a_right[0], a_right[31]); + b_right[1] = vaddq_s32(a_right[1], a_right[30]); + b_right[2] = vaddq_s32(a_right[2], a_right[29]); + b_right[3] = vaddq_s32(a_right[3], a_right[28]); + b_right[4] = vaddq_s32(a_right[4], a_right[27]); + b_right[5] = vaddq_s32(a_right[5], a_right[26]); + b_right[6] = vaddq_s32(a_right[6], a_right[25]); + b_right[7] = vaddq_s32(a_right[7], a_right[24]); + b_right[8] = vaddq_s32(a_right[8], a_right[23]); + b_right[9] = vaddq_s32(a_right[9], a_right[22]); + b_right[10] = vaddq_s32(a_right[10], a_right[21]); + b_right[11] = vaddq_s32(a_right[11], a_right[20]); + b_right[12] = vaddq_s32(a_right[12], a_right[19]); + b_right[13] = vaddq_s32(a_right[13], a_right[18]); + b_right[14] = vaddq_s32(a_right[14], a_right[17]); + b_right[15] = vaddq_s32(a_right[15], a_right[16]); + + b_left[16] = vsubq_s32(a_left[15], a_left[16]); + b_left[17] = vsubq_s32(a_left[14], a_left[17]); + b_left[18] = vsubq_s32(a_left[13], a_left[18]); + b_left[19] = vsubq_s32(a_left[12], a_left[19]); + b_left[20] = vsubq_s32(a_left[11], a_left[20]); + b_left[21] = vsubq_s32(a_left[10], a_left[21]); + b_left[22] = vsubq_s32(a_left[9], a_left[22]); + b_left[23] = vsubq_s32(a_left[8], a_left[23]); + b_left[24] = vsubq_s32(a_left[7], a_left[24]); + b_left[25] = vsubq_s32(a_left[6], a_left[25]); + b_left[26] = vsubq_s32(a_left[5], a_left[26]); + b_left[27] = vsubq_s32(a_left[4], a_left[27]); + b_left[28] = vsubq_s32(a_left[3], a_left[28]); + b_left[29] = vsubq_s32(a_left[2], a_left[29]); + b_left[30] = vsubq_s32(a_left[1], a_left[30]); + b_left[31] = vsubq_s32(a_left[0], a_left[31]); + + b_right[16] = vsubq_s32(a_right[15], a_right[16]); + b_right[17] = vsubq_s32(a_right[14], a_right[17]); + b_right[18] = vsubq_s32(a_right[13], a_right[18]); + b_right[19] = vsubq_s32(a_right[12], a_right[19]); + b_right[20] = vsubq_s32(a_right[11], a_right[20]); + b_right[21] = vsubq_s32(a_right[10], a_right[21]); + b_right[22] = vsubq_s32(a_right[9], a_right[22]); + b_right[23] = vsubq_s32(a_right[8], a_right[23]); + b_right[24] = vsubq_s32(a_right[7], a_right[24]); + b_right[25] = vsubq_s32(a_right[6], 
a_right[25]); + b_right[26] = vsubq_s32(a_right[5], a_right[26]); + b_right[27] = vsubq_s32(a_right[4], a_right[27]); + b_right[28] = vsubq_s32(a_right[3], a_right[28]); + b_right[29] = vsubq_s32(a_right[2], a_right[29]); + b_right[30] = vsubq_s32(a_right[1], a_right[30]); + b_right[31] = vsubq_s32(a_right[0], a_right[31]); +} + +static INLINE void highbd_partial_add_round_shift(int32x4_t *left /*[32]*/, + int32x4_t *right /* [32] */) { + // Also compute partial rounding shift: + // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + + left[0] = add_round_shift_s32(left[0]); + left[1] = add_round_shift_s32(left[1]); + left[2] = add_round_shift_s32(left[2]); + left[3] = add_round_shift_s32(left[3]); + left[4] = add_round_shift_s32(left[4]); + left[5] = add_round_shift_s32(left[5]); + left[6] = add_round_shift_s32(left[6]); + left[7] = add_round_shift_s32(left[7]); + left[8] = add_round_shift_s32(left[8]); + left[9] = add_round_shift_s32(left[9]); + left[10] = add_round_shift_s32(left[10]); + left[11] = add_round_shift_s32(left[11]); + left[12] = add_round_shift_s32(left[12]); + left[13] = add_round_shift_s32(left[13]); + left[14] = add_round_shift_s32(left[14]); + left[15] = add_round_shift_s32(left[15]); + left[16] = add_round_shift_s32(left[16]); + left[17] = add_round_shift_s32(left[17]); + left[18] = add_round_shift_s32(left[18]); + left[19] = add_round_shift_s32(left[19]); + left[20] = add_round_shift_s32(left[20]); + left[21] = add_round_shift_s32(left[21]); + left[22] = add_round_shift_s32(left[22]); + left[23] = add_round_shift_s32(left[23]); + left[24] = add_round_shift_s32(left[24]); + left[25] = add_round_shift_s32(left[25]); + left[26] = add_round_shift_s32(left[26]); + left[27] = add_round_shift_s32(left[27]); + left[28] = add_round_shift_s32(left[28]); + left[29] = add_round_shift_s32(left[29]); + left[30] = add_round_shift_s32(left[30]); + left[31] = add_round_shift_s32(left[31]); + + right[0] = add_round_shift_s32(right[0]); + right[1] = add_round_shift_s32(right[1]); + right[2] = add_round_shift_s32(right[2]); + right[3] = add_round_shift_s32(right[3]); + right[4] = add_round_shift_s32(right[4]); + right[5] = add_round_shift_s32(right[5]); + right[6] = add_round_shift_s32(right[6]); + right[7] = add_round_shift_s32(right[7]); + right[8] = add_round_shift_s32(right[8]); + right[9] = add_round_shift_s32(right[9]); + right[10] = add_round_shift_s32(right[10]); + right[11] = add_round_shift_s32(right[11]); + right[12] = add_round_shift_s32(right[12]); + right[13] = add_round_shift_s32(right[13]); + right[14] = add_round_shift_s32(right[14]); + right[15] = add_round_shift_s32(right[15]); + right[16] = add_round_shift_s32(right[16]); + right[17] = add_round_shift_s32(right[17]); + right[18] = add_round_shift_s32(right[18]); + right[19] = add_round_shift_s32(right[19]); + right[20] = add_round_shift_s32(right[20]); + right[21] = add_round_shift_s32(right[21]); + right[22] = add_round_shift_s32(right[22]); + right[23] = add_round_shift_s32(right[23]); + right[24] = add_round_shift_s32(right[24]); + right[25] = add_round_shift_s32(right[25]); + right[26] = add_round_shift_s32(right[26]); + right[27] = add_round_shift_s32(right[27]); + right[28] = add_round_shift_s32(right[28]); + right[29] = add_round_shift_s32(right[29]); + right[30] = add_round_shift_s32(right[30]); + right[31] = add_round_shift_s32(right[31]); +} + +static INLINE void highbd_partial_sub_round_shift(int32x4_t *left /*[32]*/, + int32x4_t *right /* [32] */) { + // Also compute partial rounding shift: 
+ // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + + left[0] = sub_round_shift_s32(left[0]); + left[1] = sub_round_shift_s32(left[1]); + left[2] = sub_round_shift_s32(left[2]); + left[3] = sub_round_shift_s32(left[3]); + left[4] = sub_round_shift_s32(left[4]); + left[5] = sub_round_shift_s32(left[5]); + left[6] = sub_round_shift_s32(left[6]); + left[7] = sub_round_shift_s32(left[7]); + left[8] = sub_round_shift_s32(left[8]); + left[9] = sub_round_shift_s32(left[9]); + left[10] = sub_round_shift_s32(left[10]); + left[11] = sub_round_shift_s32(left[11]); + left[12] = sub_round_shift_s32(left[12]); + left[13] = sub_round_shift_s32(left[13]); + left[14] = sub_round_shift_s32(left[14]); + left[15] = sub_round_shift_s32(left[15]); + left[16] = sub_round_shift_s32(left[16]); + left[17] = sub_round_shift_s32(left[17]); + left[18] = sub_round_shift_s32(left[18]); + left[19] = sub_round_shift_s32(left[19]); + left[20] = sub_round_shift_s32(left[20]); + left[21] = sub_round_shift_s32(left[21]); + left[22] = sub_round_shift_s32(left[22]); + left[23] = sub_round_shift_s32(left[23]); + left[24] = sub_round_shift_s32(left[24]); + left[25] = sub_round_shift_s32(left[25]); + left[26] = sub_round_shift_s32(left[26]); + left[27] = sub_round_shift_s32(left[27]); + left[28] = sub_round_shift_s32(left[28]); + left[29] = sub_round_shift_s32(left[29]); + left[30] = sub_round_shift_s32(left[30]); + left[31] = sub_round_shift_s32(left[31]); + + right[0] = sub_round_shift_s32(right[0]); + right[1] = sub_round_shift_s32(right[1]); + right[2] = sub_round_shift_s32(right[2]); + right[3] = sub_round_shift_s32(right[3]); + right[4] = sub_round_shift_s32(right[4]); + right[5] = sub_round_shift_s32(right[5]); + right[6] = sub_round_shift_s32(right[6]); + right[7] = sub_round_shift_s32(right[7]); + right[8] = sub_round_shift_s32(right[8]); + right[9] = sub_round_shift_s32(right[9]); + right[10] = sub_round_shift_s32(right[10]); + right[11] = sub_round_shift_s32(right[11]); + right[12] = sub_round_shift_s32(right[12]); + right[13] = sub_round_shift_s32(right[13]); + right[14] = sub_round_shift_s32(right[14]); + right[15] = sub_round_shift_s32(right[15]); + right[16] = sub_round_shift_s32(right[16]); + right[17] = sub_round_shift_s32(right[17]); + right[18] = sub_round_shift_s32(right[18]); + right[19] = sub_round_shift_s32(right[19]); + right[20] = sub_round_shift_s32(right[20]); + right[21] = sub_round_shift_s32(right[21]); + right[22] = sub_round_shift_s32(right[22]); + right[23] = sub_round_shift_s32(right[23]); + right[24] = sub_round_shift_s32(right[24]); + right[25] = sub_round_shift_s32(right[25]); + right[26] = sub_round_shift_s32(right[26]); + right[27] = sub_round_shift_s32(right[27]); + right[28] = sub_round_shift_s32(right[28]); + right[29] = sub_round_shift_s32(right[29]); + right[30] = sub_round_shift_s32(right[30]); + right[31] = sub_round_shift_s32(right[31]); +} + +static INLINE void highbd_dct8x32_body_first_pass(int32x4_t *left /*32*/, + int32x4_t *right /*32*/) { + int32x4_t al[32], ar[32]; + int32x4_t bl[32], br[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // Mini cross. X the first 16 values and the middle 8 of the second half. 
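"X" here means cross, i.e. butterfly. Concretely, in the Stage 2 block below the first 16 rows get mirrored sums and differences, rows 20-27 (the middle 8 of the upper half) are rotated by cospi_16_64, and rows 16-19 and 28-31 pass through. Scalar shape, per column group (illustration only):

for (int k = 0; k < 8; ++k) {
  b[k] = a[k] + a[15 - k];       /* mirrored sums */
  b[15 - k] = a[k] - a[15 - k];  /* mirrored differences */
}
/* rows 16..19 and 28..31: pass through; rows 20..27, pairwise:
 *   b[27] = round((a[27] + a[20]) * cospi_16_64 >> 14)
 *   b[20] = round((a[27] - a[20]) * cospi_16_64 >> 14)   ... and so on. */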
+ al[0] = vaddq_s32(left[0], left[15]); + ar[0] = vaddq_s32(right[0], right[15]); + al[1] = vaddq_s32(left[1], left[14]); + ar[1] = vaddq_s32(right[1], right[14]); + al[2] = vaddq_s32(left[2], left[13]); + ar[2] = vaddq_s32(right[2], right[13]); + al[3] = vaddq_s32(left[3], left[12]); + ar[3] = vaddq_s32(right[3], right[12]); + al[4] = vaddq_s32(left[4], left[11]); + ar[4] = vaddq_s32(right[4], right[11]); + al[5] = vaddq_s32(left[5], left[10]); + ar[5] = vaddq_s32(right[5], right[10]); + al[6] = vaddq_s32(left[6], left[9]); + ar[6] = vaddq_s32(right[6], right[9]); + al[7] = vaddq_s32(left[7], left[8]); + ar[7] = vaddq_s32(right[7], right[8]); + + al[8] = vsubq_s32(left[7], left[8]); + ar[8] = vsubq_s32(right[7], right[8]); + al[9] = vsubq_s32(left[6], left[9]); + ar[9] = vsubq_s32(right[6], right[9]); + al[10] = vsubq_s32(left[5], left[10]); + ar[10] = vsubq_s32(right[5], right[10]); + al[11] = vsubq_s32(left[4], left[11]); + ar[11] = vsubq_s32(right[4], right[11]); + al[12] = vsubq_s32(left[3], left[12]); + ar[12] = vsubq_s32(right[3], right[12]); + al[13] = vsubq_s32(left[2], left[13]); + ar[13] = vsubq_s32(right[2], right[13]); + al[14] = vsubq_s32(left[1], left[14]); + ar[14] = vsubq_s32(right[1], right[14]); + al[15] = vsubq_s32(left[0], left[15]); + ar[15] = vsubq_s32(right[0], right[15]); + + al[16] = left[16]; + ar[16] = right[16]; + al[17] = left[17]; + ar[17] = right[17]; + al[18] = left[18]; + ar[18] = right[18]; + al[19] = left[19]; + ar[19] = right[19]; + + butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20], + cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]); + butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21], + cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]); + butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22], + cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]); + butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23], + cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]); + + al[28] = left[28]; + ar[28] = right[28]; + al[29] = left[29]; + ar[29] = right[29]; + al[30] = left[30]; + ar[30] = right[30]; + al[31] = left[31]; + ar[31] = right[31]; + + // Stage 3. 
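The "fast" one-coefficient butterfly is not shown in this hunk; only its contract can be inferred from the call sites: it takes both 4-lane halves of two rows and is assumed to return add = round(((a + b) * c) >> 14) and sub = round(((a - b) * c) >> 14). A plain (non-fast) stand-in with that contract, written per half for clarity:

#include <arm_neon.h>

static inline void butterfly_one_coeff_s32_ref(const int32x4_t a,
                                               const int32x4_t b,
                                               const int32_t c, int32x4_t *add,
                                               int32x4_t *sub) {
  /* Widen the products to 64 bits so 32-bit inputs cannot overflow. */
  const int32x4_t sum = vaddq_s32(a, b);
  const int32x4_t diff = vsubq_s32(a, b);
  const int64x2_t s_lo = vmull_n_s32(vget_low_s32(sum), c);
  const int64x2_t s_hi = vmull_n_s32(vget_high_s32(sum), c);
  const int64x2_t d_lo = vmull_n_s32(vget_low_s32(diff), c);
  const int64x2_t d_hi = vmull_n_s32(vget_high_s32(diff), c);
  *add = vcombine_s32(vrshrn_n_s64(s_lo, 14), vrshrn_n_s64(s_hi, 14));
  *sub = vcombine_s32(vrshrn_n_s64(d_lo, 14), vrshrn_n_s64(d_hi, 14));
}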
+ bl[0] = vaddq_s32(al[0], al[7]); + br[0] = vaddq_s32(ar[0], ar[7]); + bl[1] = vaddq_s32(al[1], al[6]); + br[1] = vaddq_s32(ar[1], ar[6]); + bl[2] = vaddq_s32(al[2], al[5]); + br[2] = vaddq_s32(ar[2], ar[5]); + bl[3] = vaddq_s32(al[3], al[4]); + br[3] = vaddq_s32(ar[3], ar[4]); + + bl[4] = vsubq_s32(al[3], al[4]); + br[4] = vsubq_s32(ar[3], ar[4]); + bl[5] = vsubq_s32(al[2], al[5]); + br[5] = vsubq_s32(ar[2], ar[5]); + bl[6] = vsubq_s32(al[1], al[6]); + br[6] = vsubq_s32(ar[1], ar[6]); + bl[7] = vsubq_s32(al[0], al[7]); + br[7] = vsubq_s32(ar[0], ar[7]); + + bl[8] = al[8]; + br[8] = ar[8]; + bl[9] = al[9]; + br[9] = ar[9]; + + butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64, + &bl[13], &br[13], &bl[10], &br[10]); + butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64, + &bl[12], &br[12], &bl[11], &br[11]); + + bl[14] = al[14]; + br[14] = ar[14]; + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(left[16], al[23]); + br[16] = vaddq_s32(right[16], ar[23]); + bl[17] = vaddq_s32(left[17], al[22]); + br[17] = vaddq_s32(right[17], ar[22]); + bl[18] = vaddq_s32(left[18], al[21]); + br[18] = vaddq_s32(right[18], ar[21]); + bl[19] = vaddq_s32(left[19], al[20]); + br[19] = vaddq_s32(right[19], ar[20]); + + bl[20] = vsubq_s32(left[19], al[20]); + br[20] = vsubq_s32(right[19], ar[20]); + bl[21] = vsubq_s32(left[18], al[21]); + br[21] = vsubq_s32(right[18], ar[21]); + bl[22] = vsubq_s32(left[17], al[22]); + br[22] = vsubq_s32(right[17], ar[22]); + bl[23] = vsubq_s32(left[16], al[23]); + br[23] = vsubq_s32(right[16], ar[23]); + + bl[24] = vsubq_s32(left[31], al[24]); + br[24] = vsubq_s32(right[31], ar[24]); + bl[25] = vsubq_s32(left[30], al[25]); + br[25] = vsubq_s32(right[30], ar[25]); + bl[26] = vsubq_s32(left[29], al[26]); + br[26] = vsubq_s32(right[29], ar[26]); + bl[27] = vsubq_s32(left[28], al[27]); + br[27] = vsubq_s32(right[28], ar[27]); + + bl[28] = vaddq_s32(left[28], al[27]); + br[28] = vaddq_s32(right[28], ar[27]); + bl[29] = vaddq_s32(left[29], al[26]); + br[29] = vaddq_s32(right[29], ar[26]); + bl[30] = vaddq_s32(left[30], al[25]); + br[30] = vaddq_s32(right[30], ar[25]); + bl[31] = vaddq_s32(left[31], al[24]); + br[31] = vaddq_s32(right[31], ar[24]); + + // Stage 4. 
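Stage 4 below switches to butterfly_two_coeff_s32_s64_narrow: with 32-bit high-bitdepth inputs, a product against a 14-bit cosine can exceed 32 bits, so (it is assumed) the accumulation happens in 64 bits and is narrowed back with rounding. A sketch of that assumed behavior, per 4-lane half:

#include <arm_neon.h>

static inline void butterfly_two_coeff_s32_s64_narrow_sketch(
    const int32x4_t a, const int32x4_t b, const int32_t c1, const int32_t c2,
    int32x4_t *add, int32x4_t *sub) {
  /* a * c1 + b * c2 and a * c2 - b * c1, accumulated in 64 bits. */
  const int64x2_t sum_lo =
      vmlal_n_s32(vmull_n_s32(vget_low_s32(a), c1), vget_low_s32(b), c2);
  const int64x2_t sum_hi =
      vmlal_n_s32(vmull_n_s32(vget_high_s32(a), c1), vget_high_s32(b), c2);
  const int64x2_t diff_lo =
      vmlsl_n_s32(vmull_n_s32(vget_low_s32(a), c2), vget_low_s32(b), c1);
  const int64x2_t diff_hi =
      vmlsl_n_s32(vmull_n_s32(vget_high_s32(a), c2), vget_high_s32(b), c1);
  *add = vcombine_s32(vrshrn_n_s64(sum_lo, 14), vrshrn_n_s64(sum_hi, 14));
  *sub = vcombine_s32(vrshrn_n_s64(diff_lo, 14), vrshrn_n_s64(diff_hi, 14));
}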
+ al[0] = vaddq_s32(bl[0], bl[3]); + ar[0] = vaddq_s32(br[0], br[3]); + al[1] = vaddq_s32(bl[1], bl[2]); + ar[1] = vaddq_s32(br[1], br[2]); + al[2] = vsubq_s32(bl[1], bl[2]); + ar[2] = vsubq_s32(br[1], br[2]); + al[3] = vsubq_s32(bl[0], bl[3]); + ar[3] = vsubq_s32(br[0], br[3]); + + al[4] = bl[4]; + ar[4] = br[4]; + + butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6], + &ar[6], &al[5], &ar[5]); + + al[7] = bl[7]; + ar[7] = br[7]; + + al[8] = vaddq_s32(bl[8], bl[11]); + ar[8] = vaddq_s32(br[8], br[11]); + al[9] = vaddq_s32(bl[9], bl[10]); + ar[9] = vaddq_s32(br[9], br[10]); + al[10] = vsubq_s32(bl[9], bl[10]); + ar[10] = vsubq_s32(br[9], br[10]); + al[11] = vsubq_s32(bl[8], bl[11]); + ar[11] = vsubq_s32(br[8], br[11]); + al[12] = vsubq_s32(bl[15], bl[12]); + ar[12] = vsubq_s32(br[15], br[12]); + al[13] = vsubq_s32(bl[14], bl[13]); + ar[13] = vsubq_s32(br[14], br[13]); + al[14] = vaddq_s32(bl[14], bl[13]); + ar[14] = vaddq_s32(br[14], br[13]); + al[15] = vaddq_s32(bl[15], bl[12]); + ar[15] = vaddq_s32(br[15], br[12]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[17] = bl[17]; + ar[17] = br[17]; + + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64, + cospi_24_64, &al[29], &ar[29], &al[18], + &ar[18]); + butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64, + cospi_24_64, &al[28], &ar[28], &al[19], + &ar[19]); + butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], + cospi_24_64, -cospi_8_64, &al[27], &ar[27], + &al[20], &ar[20]); + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_24_64, -cospi_8_64, &al[26], &ar[26], + &al[21], &ar[21]); + + al[22] = bl[22]; + ar[22] = br[22]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[25] = bl[25]; + ar[25] = br[25]; + + al[30] = bl[30]; + ar[30] = br[30]; + al[31] = bl[31]; + ar[31] = br[31]; + + // Stage 5. 
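The cospi_*_64 coefficients threaded through all of these stages are libvpx's Q14 cosine table, cospi_k_64 = round(2^14 * cos(k * pi / 64)): cospi_16_64 = 11585 (cos(pi/4)), cospi_8_64 = 15137, cospi_24_64 = 6270, and so on. They can be reproduced with (illustration only, assumes M_PI from <math.h>):

#include <math.h>

/* round(16384 * cos(k * pi / 64)); k = 16 -> 11585, k = 8 -> 15137. */
static int cospi_64(int k) { return (int)lrint(16384.0 * cos(k * M_PI / 64.0)); }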
+ butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0], + &br[0], &bl[1], &br[1]); + butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64, + cospi_24_64, &bl[2], &br[2], &bl[3], + &br[3]); + + bl[4] = vaddq_s32(al[4], al[5]); + br[4] = vaddq_s32(ar[4], ar[5]); + bl[5] = vsubq_s32(al[4], al[5]); + br[5] = vsubq_s32(ar[4], ar[5]); + bl[6] = vsubq_s32(al[7], al[6]); + br[6] = vsubq_s32(ar[7], ar[6]); + bl[7] = vaddq_s32(al[7], al[6]); + br[7] = vaddq_s32(ar[7], ar[6]); + + bl[8] = al[8]; + br[8] = ar[8]; + + butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64, + cospi_24_64, &bl[14], &br[14], &bl[9], + &br[9]); + butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10], + cospi_24_64, -cospi_8_64, &bl[13], &br[13], + &bl[10], &br[10]); + + bl[11] = al[11]; + br[11] = ar[11]; + bl[12] = al[12]; + br[12] = ar[12]; + + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(al[19], al[16]); + br[16] = vaddq_s32(ar[19], ar[16]); + bl[17] = vaddq_s32(al[18], al[17]); + br[17] = vaddq_s32(ar[18], ar[17]); + bl[18] = vsubq_s32(al[17], al[18]); + br[18] = vsubq_s32(ar[17], ar[18]); + bl[19] = vsubq_s32(al[16], al[19]); + br[19] = vsubq_s32(ar[16], ar[19]); + bl[20] = vsubq_s32(al[23], al[20]); + br[20] = vsubq_s32(ar[23], ar[20]); + bl[21] = vsubq_s32(al[22], al[21]); + br[21] = vsubq_s32(ar[22], ar[21]); + bl[22] = vaddq_s32(al[21], al[22]); + br[22] = vaddq_s32(ar[21], ar[22]); + bl[23] = vaddq_s32(al[20], al[23]); + br[23] = vaddq_s32(ar[20], ar[23]); + bl[24] = vaddq_s32(al[27], al[24]); + br[24] = vaddq_s32(ar[27], ar[24]); + bl[25] = vaddq_s32(al[26], al[25]); + br[25] = vaddq_s32(ar[26], ar[25]); + bl[26] = vsubq_s32(al[25], al[26]); + br[26] = vsubq_s32(ar[25], ar[26]); + bl[27] = vsubq_s32(al[24], al[27]); + br[27] = vsubq_s32(ar[24], ar[27]); + bl[28] = vsubq_s32(al[31], al[28]); + br[28] = vsubq_s32(ar[31], ar[28]); + bl[29] = vsubq_s32(al[30], al[29]); + br[29] = vsubq_s32(ar[30], ar[29]); + bl[30] = vaddq_s32(al[29], al[30]); + br[30] = vaddq_s32(ar[29], ar[30]); + bl[31] = vaddq_s32(al[28], al[31]); + br[31] = vaddq_s32(ar[28], ar[31]); + + // Stage 6. 
+ al[0] = bl[0]; + ar[0] = br[0]; + al[1] = bl[1]; + ar[1] = br[1]; + al[2] = bl[2]; + ar[2] = br[2]; + al[3] = bl[3]; + ar[3] = br[3]; + + butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64, + cospi_28_64, &al[4], &ar[4], &al[7], + &ar[7]); + butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64, + cospi_12_64, &al[5], &ar[5], &al[6], + &ar[6]); + + al[8] = vaddq_s32(bl[8], bl[9]); + ar[8] = vaddq_s32(br[8], br[9]); + al[9] = vsubq_s32(bl[8], bl[9]); + ar[9] = vsubq_s32(br[8], br[9]); + al[10] = vsubq_s32(bl[11], bl[10]); + ar[10] = vsubq_s32(br[11], br[10]); + al[11] = vaddq_s32(bl[11], bl[10]); + ar[11] = vaddq_s32(br[11], br[10]); + al[12] = vaddq_s32(bl[12], bl[13]); + ar[12] = vaddq_s32(br[12], br[13]); + al[13] = vsubq_s32(bl[12], bl[13]); + ar[13] = vsubq_s32(br[12], br[13]); + al[14] = vsubq_s32(bl[15], bl[14]); + ar[14] = vsubq_s32(br[15], br[14]); + al[15] = vaddq_s32(bl[15], bl[14]); + ar[15] = vaddq_s32(br[15], br[14]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[19] = bl[19]; + ar[19] = br[19]; + al[20] = bl[20]; + ar[20] = br[20]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[27] = bl[27]; + ar[27] = br[27]; + al[28] = bl[28]; + ar[28] = br[28]; + al[31] = bl[31]; + ar[31] = br[31]; + + butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64, + cospi_28_64, &al[30], &ar[30], &al[17], + &ar[17]); + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], + cospi_28_64, -cospi_4_64, &al[29], &ar[29], + &al[18], &ar[18]); + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_20_64, cospi_12_64, &al[26], &ar[26], + &al[21], &ar[21]); + butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22], + cospi_12_64, -cospi_20_64, &al[25], + &ar[25], &al[22], &ar[22]); + + // Stage 7. 
+ bl[0] = al[0]; + br[0] = ar[0]; + bl[1] = al[1]; + br[1] = ar[1]; + bl[2] = al[2]; + br[2] = ar[2]; + bl[3] = al[3]; + br[3] = ar[3]; + bl[4] = al[4]; + br[4] = ar[4]; + bl[5] = al[5]; + br[5] = ar[5]; + bl[6] = al[6]; + br[6] = ar[6]; + bl[7] = al[7]; + br[7] = ar[7]; + + butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64, + cospi_30_64, &bl[8], &br[8], &bl[15], + &br[15]); + butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64, + cospi_14_64, &bl[9], &br[9], &bl[14], + &br[14]); + butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10], + cospi_10_64, cospi_22_64, &bl[10], &br[10], + &bl[13], &br[13]); + butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11], + cospi_26_64, cospi_6_64, &bl[11], &br[11], + &bl[12], &br[12]); + + bl[16] = vaddq_s32(al[16], al[17]); + br[16] = vaddq_s32(ar[16], ar[17]); + bl[17] = vsubq_s32(al[16], al[17]); + br[17] = vsubq_s32(ar[16], ar[17]); + bl[18] = vsubq_s32(al[19], al[18]); + br[18] = vsubq_s32(ar[19], ar[18]); + bl[19] = vaddq_s32(al[19], al[18]); + br[19] = vaddq_s32(ar[19], ar[18]); + bl[20] = vaddq_s32(al[20], al[21]); + br[20] = vaddq_s32(ar[20], ar[21]); + bl[21] = vsubq_s32(al[20], al[21]); + br[21] = vsubq_s32(ar[20], ar[21]); + bl[22] = vsubq_s32(al[23], al[22]); + br[22] = vsubq_s32(ar[23], ar[22]); + bl[23] = vaddq_s32(al[23], al[22]); + br[23] = vaddq_s32(ar[23], ar[22]); + bl[24] = vaddq_s32(al[24], al[25]); + br[24] = vaddq_s32(ar[24], ar[25]); + bl[25] = vsubq_s32(al[24], al[25]); + br[25] = vsubq_s32(ar[24], ar[25]); + bl[26] = vsubq_s32(al[27], al[26]); + br[26] = vsubq_s32(ar[27], ar[26]); + bl[27] = vaddq_s32(al[27], al[26]); + br[27] = vaddq_s32(ar[27], ar[26]); + bl[28] = vaddq_s32(al[28], al[29]); + br[28] = vaddq_s32(ar[28], ar[29]); + bl[29] = vsubq_s32(al[28], al[29]); + br[29] = vsubq_s32(ar[28], ar[29]); + bl[30] = vsubq_s32(al[31], al[30]); + br[30] = vsubq_s32(ar[31], ar[30]); + bl[31] = vaddq_s32(al[31], al[30]); + br[31] = vaddq_s32(ar[31], ar[30]); + + // Final stage. 
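The scatter in the final stage below is the 5-bit bit-reversal permutation on row indices: the sixteen pass-through values land at rows reverse5(k), so bl[1] goes to row 16, bl[2] to row 8, bl[3] to row 24, and so on, while the closing butterflies fill the remaining odd rows directly. For reference (illustration only):

/* Reverse the low 5 bits of k: 1 -> 16, 2 -> 8, 3 -> 24, 8 -> 2, 16 -> 1. */
static inline int reverse5(int k) {
  return ((k & 1) << 4) | ((k & 2) << 2) | (k & 4) | ((k & 8) >> 2) |
         ((k & 16) >> 4);
}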
+ + left[0] = bl[0]; + right[0] = br[0]; + left[16] = bl[1]; + right[16] = br[1]; + left[8] = bl[2]; + right[8] = br[2]; + left[24] = bl[3]; + right[24] = br[3]; + left[4] = bl[4]; + right[4] = br[4]; + left[20] = bl[5]; + right[20] = br[5]; + left[12] = bl[6]; + right[12] = br[6]; + left[28] = bl[7]; + right[28] = br[7]; + left[2] = bl[8]; + right[2] = br[8]; + left[18] = bl[9]; + right[18] = br[9]; + left[10] = bl[10]; + right[10] = br[10]; + left[26] = bl[11]; + right[26] = br[11]; + left[6] = bl[12]; + right[6] = br[12]; + left[22] = bl[13]; + right[22] = br[13]; + left[14] = bl[14]; + right[14] = br[14]; + left[30] = bl[15]; + right[30] = br[15]; + + butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64, + cospi_31_64, &al[1], &ar[1], &al[31], + &ar[31]); + left[1] = al[1]; + right[1] = ar[1]; + left[31] = al[31]; + right[31] = ar[31]; + + butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], + cospi_17_64, cospi_15_64, &al[17], &ar[17], + &al[15], &ar[15]); + left[17] = al[17]; + right[17] = ar[17]; + left[15] = al[15]; + right[15] = ar[15]; + + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64, + cospi_23_64, &al[9], &ar[9], &al[23], + &ar[23]); + left[9] = al[9]; + right[9] = ar[9]; + left[23] = al[23]; + right[23] = ar[23]; + + butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], + cospi_25_64, cospi_7_64, &al[25], &ar[25], + &al[7], &ar[7]); + left[25] = al[25]; + right[25] = ar[25]; + left[7] = al[7]; + right[7] = ar[7]; + + butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64, + cospi_27_64, &al[5], &ar[5], &al[27], + &ar[27]); + left[5] = al[5]; + right[5] = ar[5]; + left[27] = al[27]; + right[27] = ar[27]; + + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_21_64, cospi_11_64, &al[21], &ar[21], + &al[11], &ar[11]); + left[21] = al[21]; + right[21] = ar[21]; + left[11] = al[11]; + right[11] = ar[11]; + + butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22], + cospi_13_64, cospi_19_64, &al[13], &ar[13], + &al[19], &ar[19]); + left[13] = al[13]; + right[13] = ar[13]; + left[19] = al[19]; + right[19] = ar[19]; + + butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23], + cospi_29_64, cospi_3_64, &al[29], &ar[29], + &al[3], &ar[3]); + left[29] = al[29]; + right[29] = ar[29]; + left[3] = al[3]; + right[3] = ar[3]; +} + +static INLINE void highbd_dct8x32_body_second_pass(int32x4_t *left /*32*/, + int32x4_t *right /*32*/) { + int32x4_t al[32], ar[32]; + int32x4_t bl[32], br[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // Mini cross. X the first 16 values and the middle 8 of the second half. 
+ al[0] = vaddq_s32(left[0], left[15]); + ar[0] = vaddq_s32(right[0], right[15]); + al[1] = vaddq_s32(left[1], left[14]); + ar[1] = vaddq_s32(right[1], right[14]); + al[2] = vaddq_s32(left[2], left[13]); + ar[2] = vaddq_s32(right[2], right[13]); + al[3] = vaddq_s32(left[3], left[12]); + ar[3] = vaddq_s32(right[3], right[12]); + al[4] = vaddq_s32(left[4], left[11]); + ar[4] = vaddq_s32(right[4], right[11]); + al[5] = vaddq_s32(left[5], left[10]); + ar[5] = vaddq_s32(right[5], right[10]); + al[6] = vaddq_s32(left[6], left[9]); + ar[6] = vaddq_s32(right[6], right[9]); + al[7] = vaddq_s32(left[7], left[8]); + ar[7] = vaddq_s32(right[7], right[8]); + + al[8] = vsubq_s32(left[7], left[8]); + ar[8] = vsubq_s32(right[7], right[8]); + al[9] = vsubq_s32(left[6], left[9]); + ar[9] = vsubq_s32(right[6], right[9]); + al[10] = vsubq_s32(left[5], left[10]); + ar[10] = vsubq_s32(right[5], right[10]); + al[11] = vsubq_s32(left[4], left[11]); + ar[11] = vsubq_s32(right[4], right[11]); + al[12] = vsubq_s32(left[3], left[12]); + ar[12] = vsubq_s32(right[3], right[12]); + al[13] = vsubq_s32(left[2], left[13]); + ar[13] = vsubq_s32(right[2], right[13]); + al[14] = vsubq_s32(left[1], left[14]); + ar[14] = vsubq_s32(right[1], right[14]); + al[15] = vsubq_s32(left[0], left[15]); + ar[15] = vsubq_s32(right[0], right[15]); + + al[16] = left[16]; + ar[16] = right[16]; + al[17] = left[17]; + ar[17] = right[17]; + al[18] = left[18]; + ar[18] = right[18]; + al[19] = left[19]; + ar[19] = right[19]; + + butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20], + cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]); + butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21], + cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]); + butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22], + cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]); + butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23], + cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]); + + al[28] = left[28]; + ar[28] = right[28]; + al[29] = left[29]; + ar[29] = right[29]; + al[30] = left[30]; + ar[30] = right[30]; + al[31] = left[31]; + ar[31] = right[31]; + + // Stage 3. 
+ bl[0] = vaddq_s32(al[0], al[7]); + br[0] = vaddq_s32(ar[0], ar[7]); + bl[1] = vaddq_s32(al[1], al[6]); + br[1] = vaddq_s32(ar[1], ar[6]); + bl[2] = vaddq_s32(al[2], al[5]); + br[2] = vaddq_s32(ar[2], ar[5]); + bl[3] = vaddq_s32(al[3], al[4]); + br[3] = vaddq_s32(ar[3], ar[4]); + + bl[4] = vsubq_s32(al[3], al[4]); + br[4] = vsubq_s32(ar[3], ar[4]); + bl[5] = vsubq_s32(al[2], al[5]); + br[5] = vsubq_s32(ar[2], ar[5]); + bl[6] = vsubq_s32(al[1], al[6]); + br[6] = vsubq_s32(ar[1], ar[6]); + bl[7] = vsubq_s32(al[0], al[7]); + br[7] = vsubq_s32(ar[0], ar[7]); + + bl[8] = al[8]; + br[8] = ar[8]; + bl[9] = al[9]; + br[9] = ar[9]; + + butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64, + &bl[13], &br[13], &bl[10], &br[10]); + butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64, + &bl[12], &br[12], &bl[11], &br[11]); + + bl[14] = al[14]; + br[14] = ar[14]; + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(left[16], al[23]); + br[16] = vaddq_s32(right[16], ar[23]); + bl[17] = vaddq_s32(left[17], al[22]); + br[17] = vaddq_s32(right[17], ar[22]); + bl[18] = vaddq_s32(left[18], al[21]); + br[18] = vaddq_s32(right[18], ar[21]); + bl[19] = vaddq_s32(left[19], al[20]); + br[19] = vaddq_s32(right[19], ar[20]); + + bl[20] = vsubq_s32(left[19], al[20]); + br[20] = vsubq_s32(right[19], ar[20]); + bl[21] = vsubq_s32(left[18], al[21]); + br[21] = vsubq_s32(right[18], ar[21]); + bl[22] = vsubq_s32(left[17], al[22]); + br[22] = vsubq_s32(right[17], ar[22]); + bl[23] = vsubq_s32(left[16], al[23]); + br[23] = vsubq_s32(right[16], ar[23]); + + bl[24] = vsubq_s32(left[31], al[24]); + br[24] = vsubq_s32(right[31], ar[24]); + bl[25] = vsubq_s32(left[30], al[25]); + br[25] = vsubq_s32(right[30], ar[25]); + bl[26] = vsubq_s32(left[29], al[26]); + br[26] = vsubq_s32(right[29], ar[26]); + bl[27] = vsubq_s32(left[28], al[27]); + br[27] = vsubq_s32(right[28], ar[27]); + + bl[28] = vaddq_s32(left[28], al[27]); + br[28] = vaddq_s32(right[28], ar[27]); + bl[29] = vaddq_s32(left[29], al[26]); + br[29] = vaddq_s32(right[29], ar[26]); + bl[30] = vaddq_s32(left[30], al[25]); + br[30] = vaddq_s32(right[30], ar[25]); + bl[31] = vaddq_s32(left[31], al[24]); + br[31] = vaddq_s32(right[31], ar[24]); + + // Stage 4. 
+ al[0] = vaddq_s32(bl[0], bl[3]); + ar[0] = vaddq_s32(br[0], br[3]); + al[1] = vaddq_s32(bl[1], bl[2]); + ar[1] = vaddq_s32(br[1], br[2]); + al[2] = vsubq_s32(bl[1], bl[2]); + ar[2] = vsubq_s32(br[1], br[2]); + al[3] = vsubq_s32(bl[0], bl[3]); + ar[3] = vsubq_s32(br[0], br[3]); + + al[4] = bl[4]; + ar[4] = br[4]; + + butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6], + &ar[6], &al[5], &ar[5]); + + al[7] = bl[7]; + ar[7] = br[7]; + + al[8] = vaddq_s32(bl[8], bl[11]); + ar[8] = vaddq_s32(br[8], br[11]); + al[9] = vaddq_s32(bl[9], bl[10]); + ar[9] = vaddq_s32(br[9], br[10]); + al[10] = vsubq_s32(bl[9], bl[10]); + ar[10] = vsubq_s32(br[9], br[10]); + al[11] = vsubq_s32(bl[8], bl[11]); + ar[11] = vsubq_s32(br[8], br[11]); + al[12] = vsubq_s32(bl[15], bl[12]); + ar[12] = vsubq_s32(br[15], br[12]); + al[13] = vsubq_s32(bl[14], bl[13]); + ar[13] = vsubq_s32(br[14], br[13]); + al[14] = vaddq_s32(bl[14], bl[13]); + ar[14] = vaddq_s32(br[14], br[13]); + al[15] = vaddq_s32(bl[15], bl[12]); + ar[15] = vaddq_s32(br[15], br[12]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[17] = bl[17]; + ar[17] = br[17]; + + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64, + cospi_24_64, &al[29], &ar[29], &al[18], + &ar[18]); + butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64, + cospi_24_64, &al[28], &ar[28], &al[19], + &ar[19]); + butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], + cospi_24_64, -cospi_8_64, &al[27], &ar[27], + &al[20], &ar[20]); + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_24_64, -cospi_8_64, &al[26], &ar[26], + &al[21], &ar[21]); + + al[22] = bl[22]; + ar[22] = br[22]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[25] = bl[25]; + ar[25] = br[25]; + + al[30] = bl[30]; + ar[30] = br[30]; + al[31] = bl[31]; + ar[31] = br[31]; + + // Stage 5. 
+ butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0], + &br[0], &bl[1], &br[1]); + butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64, + cospi_24_64, &bl[2], &br[2], &bl[3], + &br[3]); + + bl[4] = vaddq_s32(al[4], al[5]); + br[4] = vaddq_s32(ar[4], ar[5]); + bl[5] = vsubq_s32(al[4], al[5]); + br[5] = vsubq_s32(ar[4], ar[5]); + bl[6] = vsubq_s32(al[7], al[6]); + br[6] = vsubq_s32(ar[7], ar[6]); + bl[7] = vaddq_s32(al[7], al[6]); + br[7] = vaddq_s32(ar[7], ar[6]); + + bl[8] = al[8]; + br[8] = ar[8]; + + butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64, + cospi_24_64, &bl[14], &br[14], &bl[9], + &br[9]); + butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10], + cospi_24_64, -cospi_8_64, &bl[13], &br[13], + &bl[10], &br[10]); + + bl[11] = al[11]; + br[11] = ar[11]; + bl[12] = al[12]; + br[12] = ar[12]; + + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(al[19], al[16]); + br[16] = vaddq_s32(ar[19], ar[16]); + bl[17] = vaddq_s32(al[18], al[17]); + br[17] = vaddq_s32(ar[18], ar[17]); + bl[18] = vsubq_s32(al[17], al[18]); + br[18] = vsubq_s32(ar[17], ar[18]); + bl[19] = vsubq_s32(al[16], al[19]); + br[19] = vsubq_s32(ar[16], ar[19]); + bl[20] = vsubq_s32(al[23], al[20]); + br[20] = vsubq_s32(ar[23], ar[20]); + bl[21] = vsubq_s32(al[22], al[21]); + br[21] = vsubq_s32(ar[22], ar[21]); + bl[22] = vaddq_s32(al[21], al[22]); + br[22] = vaddq_s32(ar[21], ar[22]); + bl[23] = vaddq_s32(al[20], al[23]); + br[23] = vaddq_s32(ar[20], ar[23]); + bl[24] = vaddq_s32(al[27], al[24]); + br[24] = vaddq_s32(ar[27], ar[24]); + bl[25] = vaddq_s32(al[26], al[25]); + br[25] = vaddq_s32(ar[26], ar[25]); + bl[26] = vsubq_s32(al[25], al[26]); + br[26] = vsubq_s32(ar[25], ar[26]); + bl[27] = vsubq_s32(al[24], al[27]); + br[27] = vsubq_s32(ar[24], ar[27]); + bl[28] = vsubq_s32(al[31], al[28]); + br[28] = vsubq_s32(ar[31], ar[28]); + bl[29] = vsubq_s32(al[30], al[29]); + br[29] = vsubq_s32(ar[30], ar[29]); + bl[30] = vaddq_s32(al[29], al[30]); + br[30] = vaddq_s32(ar[29], ar[30]); + bl[31] = vaddq_s32(al[28], al[31]); + br[31] = vaddq_s32(ar[28], ar[31]); + + // Stage 6. 
+ al[0] = bl[0]; + ar[0] = br[0]; + al[1] = bl[1]; + ar[1] = br[1]; + al[2] = bl[2]; + ar[2] = br[2]; + al[3] = bl[3]; + ar[3] = br[3]; + + butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64, + cospi_28_64, &al[4], &ar[4], &al[7], + &ar[7]); + butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64, + cospi_12_64, &al[5], &ar[5], &al[6], + &ar[6]); + + al[8] = vaddq_s32(bl[8], bl[9]); + ar[8] = vaddq_s32(br[8], br[9]); + al[9] = vsubq_s32(bl[8], bl[9]); + ar[9] = vsubq_s32(br[8], br[9]); + al[10] = vsubq_s32(bl[11], bl[10]); + ar[10] = vsubq_s32(br[11], br[10]); + al[11] = vaddq_s32(bl[11], bl[10]); + ar[11] = vaddq_s32(br[11], br[10]); + al[12] = vaddq_s32(bl[12], bl[13]); + ar[12] = vaddq_s32(br[12], br[13]); + al[13] = vsubq_s32(bl[12], bl[13]); + ar[13] = vsubq_s32(br[12], br[13]); + al[14] = vsubq_s32(bl[15], bl[14]); + ar[14] = vsubq_s32(br[15], br[14]); + al[15] = vaddq_s32(bl[15], bl[14]); + ar[15] = vaddq_s32(br[15], br[14]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[19] = bl[19]; + ar[19] = br[19]; + al[20] = bl[20]; + ar[20] = br[20]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[27] = bl[27]; + ar[27] = br[27]; + al[28] = bl[28]; + ar[28] = br[28]; + al[31] = bl[31]; + ar[31] = br[31]; + + butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64, + cospi_28_64, &al[30], &ar[30], &al[17], + &ar[17]); + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], + cospi_28_64, -cospi_4_64, &al[29], &ar[29], + &al[18], &ar[18]); + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_20_64, cospi_12_64, &al[26], &ar[26], + &al[21], &ar[21]); + butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22], + cospi_12_64, -cospi_20_64, &al[25], + &ar[25], &al[22], &ar[22]); + + // Stage 7. 
+ bl[0] = al[0]; + br[0] = ar[0]; + bl[1] = al[1]; + br[1] = ar[1]; + bl[2] = al[2]; + br[2] = ar[2]; + bl[3] = al[3]; + br[3] = ar[3]; + bl[4] = al[4]; + br[4] = ar[4]; + bl[5] = al[5]; + br[5] = ar[5]; + bl[6] = al[6]; + br[6] = ar[6]; + bl[7] = al[7]; + br[7] = ar[7]; + + butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64, + cospi_30_64, &bl[8], &br[8], &bl[15], + &br[15]); + butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64, + cospi_14_64, &bl[9], &br[9], &bl[14], + &br[14]); + butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10], + cospi_10_64, cospi_22_64, &bl[10], &br[10], + &bl[13], &br[13]); + butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11], + cospi_26_64, cospi_6_64, &bl[11], &br[11], + &bl[12], &br[12]); + + bl[16] = vaddq_s32(al[16], al[17]); + br[16] = vaddq_s32(ar[16], ar[17]); + bl[17] = vsubq_s32(al[16], al[17]); + br[17] = vsubq_s32(ar[16], ar[17]); + bl[18] = vsubq_s32(al[19], al[18]); + br[18] = vsubq_s32(ar[19], ar[18]); + bl[19] = vaddq_s32(al[19], al[18]); + br[19] = vaddq_s32(ar[19], ar[18]); + bl[20] = vaddq_s32(al[20], al[21]); + br[20] = vaddq_s32(ar[20], ar[21]); + bl[21] = vsubq_s32(al[20], al[21]); + br[21] = vsubq_s32(ar[20], ar[21]); + bl[22] = vsubq_s32(al[23], al[22]); + br[22] = vsubq_s32(ar[23], ar[22]); + bl[23] = vaddq_s32(al[23], al[22]); + br[23] = vaddq_s32(ar[23], ar[22]); + bl[24] = vaddq_s32(al[24], al[25]); + br[24] = vaddq_s32(ar[24], ar[25]); + bl[25] = vsubq_s32(al[24], al[25]); + br[25] = vsubq_s32(ar[24], ar[25]); + bl[26] = vsubq_s32(al[27], al[26]); + br[26] = vsubq_s32(ar[27], ar[26]); + bl[27] = vaddq_s32(al[27], al[26]); + br[27] = vaddq_s32(ar[27], ar[26]); + bl[28] = vaddq_s32(al[28], al[29]); + br[28] = vaddq_s32(ar[28], ar[29]); + bl[29] = vsubq_s32(al[28], al[29]); + br[29] = vsubq_s32(ar[28], ar[29]); + bl[30] = vsubq_s32(al[31], al[30]); + br[30] = vsubq_s32(ar[31], ar[30]); + bl[31] = vaddq_s32(al[31], al[30]); + br[31] = vaddq_s32(ar[31], ar[30]); + + // Final stage. 
+ + left[0] = bl[0]; + right[0] = br[0]; + left[16] = bl[1]; + right[16] = br[1]; + left[8] = bl[2]; + right[8] = br[2]; + left[24] = bl[3]; + right[24] = br[3]; + left[4] = bl[4]; + right[4] = br[4]; + left[20] = bl[5]; + right[20] = br[5]; + left[12] = bl[6]; + right[12] = br[6]; + left[28] = bl[7]; + right[28] = br[7]; + left[2] = bl[8]; + right[2] = br[8]; + left[18] = bl[9]; + right[18] = br[9]; + left[10] = bl[10]; + right[10] = br[10]; + left[26] = bl[11]; + right[26] = br[11]; + left[6] = bl[12]; + right[6] = br[12]; + left[22] = bl[13]; + right[22] = br[13]; + left[14] = bl[14]; + right[14] = br[14]; + left[30] = bl[15]; + right[30] = br[15]; + + butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64, + cospi_31_64, &al[1], &ar[1], &al[31], + &ar[31]); + left[1] = al[1]; + right[1] = ar[1]; + left[31] = al[31]; + right[31] = ar[31]; + + butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], + cospi_17_64, cospi_15_64, &al[17], &ar[17], + &al[15], &ar[15]); + left[17] = al[17]; + right[17] = ar[17]; + left[15] = al[15]; + right[15] = ar[15]; + + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64, + cospi_23_64, &al[9], &ar[9], &al[23], + &ar[23]); + left[9] = al[9]; + right[9] = ar[9]; + left[23] = al[23]; + right[23] = ar[23]; + + butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], + cospi_25_64, cospi_7_64, &al[25], &ar[25], + &al[7], &ar[7]); + left[25] = al[25]; + right[25] = ar[25]; + left[7] = al[7]; + right[7] = ar[7]; + + butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64, + cospi_27_64, &al[5], &ar[5], &al[27], + &ar[27]); + left[5] = al[5]; + right[5] = ar[5]; + left[27] = al[27]; + right[27] = ar[27]; + + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_21_64, cospi_11_64, &al[21], &ar[21], + &al[11], &ar[11]); + left[21] = al[21]; + right[21] = ar[21]; + left[11] = al[11]; + right[11] = ar[11]; + + butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22], + cospi_13_64, cospi_19_64, &al[13], &ar[13], + &al[19], &ar[19]); + left[13] = al[13]; + right[13] = ar[13]; + left[19] = al[19]; + right[19] = ar[19]; + + butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23], + cospi_29_64, cospi_3_64, &al[29], &ar[29], + &al[3], &ar[3]); + left[29] = al[29]; + right[29] = ar[29]; + left[3] = al[3]; + right[3] = ar[3]; +} + +static INLINE void highbd_dct8x32_body_second_pass_rd(int32x4_t *left /*32*/, + int32x4_t *right /*32*/) { + int32x4_t al[32], ar[32]; + int32x4_t bl[32], br[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // For the "rd" version, all the values are rounded down after stage 2 to keep + // the values in 16 bits. 
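Rounding every lane down here bounds the magnitudes so that the narrower 32-bit butterflies used later in this function stay overflow-free. Assuming add_round_shift_s32 implements the (a + 1 + (a > 0)) >> 2 formula quoted in the partial-round-shift comments above, a sketch:

#include <arm_neon.h>

static inline int32x4_t add_round_shift_s32_sketch(const int32x4_t a) {
  const int32x4_t one = vdupq_n_s32(1);
  /* All-ones (-1) where a > 0; subtracting the mask adds the (a > 0) term. */
  const uint32x4_t pos = vcgtq_s32(a, vdupq_n_s32(0));
  return vshrq_n_s32(
      vsubq_s32(vaddq_s32(a, one), vreinterpretq_s32_u32(pos)), 2);
}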
+ al[0] = add_round_shift_s32(vaddq_s32(left[0], left[15])); + ar[0] = add_round_shift_s32(vaddq_s32(right[0], right[15])); + al[1] = add_round_shift_s32(vaddq_s32(left[1], left[14])); + ar[1] = add_round_shift_s32(vaddq_s32(right[1], right[14])); + al[2] = add_round_shift_s32(vaddq_s32(left[2], left[13])); + ar[2] = add_round_shift_s32(vaddq_s32(right[2], right[13])); + al[3] = add_round_shift_s32(vaddq_s32(left[3], left[12])); + ar[3] = add_round_shift_s32(vaddq_s32(right[3], right[12])); + al[4] = add_round_shift_s32(vaddq_s32(left[4], left[11])); + ar[4] = add_round_shift_s32(vaddq_s32(right[4], right[11])); + al[5] = add_round_shift_s32(vaddq_s32(left[5], left[10])); + ar[5] = add_round_shift_s32(vaddq_s32(right[5], right[10])); + al[6] = add_round_shift_s32(vaddq_s32(left[6], left[9])); + ar[6] = add_round_shift_s32(vaddq_s32(right[6], right[9])); + al[7] = add_round_shift_s32(vaddq_s32(left[7], left[8])); + ar[7] = add_round_shift_s32(vaddq_s32(right[7], right[8])); + + al[8] = add_round_shift_s32(vsubq_s32(left[7], left[8])); + ar[8] = add_round_shift_s32(vsubq_s32(right[7], right[8])); + al[9] = add_round_shift_s32(vsubq_s32(left[6], left[9])); + ar[9] = add_round_shift_s32(vsubq_s32(right[6], right[9])); + al[10] = add_round_shift_s32(vsubq_s32(left[5], left[10])); + ar[10] = add_round_shift_s32(vsubq_s32(right[5], right[10])); + al[11] = add_round_shift_s32(vsubq_s32(left[4], left[11])); + ar[11] = add_round_shift_s32(vsubq_s32(right[4], right[11])); + al[12] = add_round_shift_s32(vsubq_s32(left[3], left[12])); + ar[12] = add_round_shift_s32(vsubq_s32(right[3], right[12])); + al[13] = add_round_shift_s32(vsubq_s32(left[2], left[13])); + ar[13] = add_round_shift_s32(vsubq_s32(right[2], right[13])); + al[14] = add_round_shift_s32(vsubq_s32(left[1], left[14])); + ar[14] = add_round_shift_s32(vsubq_s32(right[1], right[14])); + al[15] = add_round_shift_s32(vsubq_s32(left[0], left[15])); + ar[15] = add_round_shift_s32(vsubq_s32(right[0], right[15])); + + al[16] = add_round_shift_s32(left[16]); + ar[16] = add_round_shift_s32(right[16]); + al[17] = add_round_shift_s32(left[17]); + ar[17] = add_round_shift_s32(right[17]); + al[18] = add_round_shift_s32(left[18]); + ar[18] = add_round_shift_s32(right[18]); + al[19] = add_round_shift_s32(left[19]); + ar[19] = add_round_shift_s32(right[19]); + + butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20], + cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]); + butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21], + cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]); + butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22], + cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]); + butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23], + cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]); + + al[20] = add_round_shift_s32(al[20]); + ar[20] = add_round_shift_s32(ar[20]); + al[21] = add_round_shift_s32(al[21]); + ar[21] = add_round_shift_s32(ar[21]); + al[22] = add_round_shift_s32(al[22]); + ar[22] = add_round_shift_s32(ar[22]); + al[23] = add_round_shift_s32(al[23]); + ar[23] = add_round_shift_s32(ar[23]); + al[24] = add_round_shift_s32(al[24]); + ar[24] = add_round_shift_s32(ar[24]); + al[25] = add_round_shift_s32(al[25]); + ar[25] = add_round_shift_s32(ar[25]); + al[26] = add_round_shift_s32(al[26]); + ar[26] = add_round_shift_s32(ar[26]); + al[27] = add_round_shift_s32(al[27]); + ar[27] = add_round_shift_s32(ar[27]); + + al[28] = add_round_shift_s32(left[28]); + ar[28] = 
add_round_shift_s32(right[28]); + al[29] = add_round_shift_s32(left[29]); + ar[29] = add_round_shift_s32(right[29]); + al[30] = add_round_shift_s32(left[30]); + ar[30] = add_round_shift_s32(right[30]); + al[31] = add_round_shift_s32(left[31]); + ar[31] = add_round_shift_s32(right[31]); + + // Stage 3. + bl[0] = vaddq_s32(al[0], al[7]); + br[0] = vaddq_s32(ar[0], ar[7]); + bl[1] = vaddq_s32(al[1], al[6]); + br[1] = vaddq_s32(ar[1], ar[6]); + bl[2] = vaddq_s32(al[2], al[5]); + br[2] = vaddq_s32(ar[2], ar[5]); + bl[3] = vaddq_s32(al[3], al[4]); + br[3] = vaddq_s32(ar[3], ar[4]); + + bl[4] = vsubq_s32(al[3], al[4]); + br[4] = vsubq_s32(ar[3], ar[4]); + bl[5] = vsubq_s32(al[2], al[5]); + br[5] = vsubq_s32(ar[2], ar[5]); + bl[6] = vsubq_s32(al[1], al[6]); + br[6] = vsubq_s32(ar[1], ar[6]); + bl[7] = vsubq_s32(al[0], al[7]); + br[7] = vsubq_s32(ar[0], ar[7]); + + bl[8] = al[8]; + br[8] = ar[8]; + bl[9] = al[9]; + br[9] = ar[9]; + + butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64, + &bl[13], &br[13], &bl[10], &br[10]); + butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64, + &bl[12], &br[12], &bl[11], &br[11]); + + bl[14] = al[14]; + br[14] = ar[14]; + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(al[16], al[23]); + br[16] = vaddq_s32(ar[16], ar[23]); + bl[17] = vaddq_s32(al[17], al[22]); + br[17] = vaddq_s32(ar[17], ar[22]); + bl[18] = vaddq_s32(al[18], al[21]); + br[18] = vaddq_s32(ar[18], ar[21]); + bl[19] = vaddq_s32(al[19], al[20]); + br[19] = vaddq_s32(ar[19], ar[20]); + + bl[20] = vsubq_s32(al[19], al[20]); + br[20] = vsubq_s32(ar[19], ar[20]); + bl[21] = vsubq_s32(al[18], al[21]); + br[21] = vsubq_s32(ar[18], ar[21]); + bl[22] = vsubq_s32(al[17], al[22]); + br[22] = vsubq_s32(ar[17], ar[22]); + bl[23] = vsubq_s32(al[16], al[23]); + br[23] = vsubq_s32(ar[16], ar[23]); + + bl[24] = vsubq_s32(al[31], al[24]); + br[24] = vsubq_s32(ar[31], ar[24]); + bl[25] = vsubq_s32(al[30], al[25]); + br[25] = vsubq_s32(ar[30], ar[25]); + bl[26] = vsubq_s32(al[29], al[26]); + br[26] = vsubq_s32(ar[29], ar[26]); + bl[27] = vsubq_s32(al[28], al[27]); + br[27] = vsubq_s32(ar[28], ar[27]); + + bl[28] = vaddq_s32(al[28], al[27]); + br[28] = vaddq_s32(ar[28], ar[27]); + bl[29] = vaddq_s32(al[29], al[26]); + br[29] = vaddq_s32(ar[29], ar[26]); + bl[30] = vaddq_s32(al[30], al[25]); + br[30] = vaddq_s32(ar[30], ar[25]); + bl[31] = vaddq_s32(al[31], al[24]); + br[31] = vaddq_s32(ar[31], ar[24]); + + // Stage 4. 
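+  // Stage 4 butterflies the top of the even half (al[0..3]), rotates the
+  // bl[5]/bl[6] pair by cospi_16_64, butterflies terms 8-15 in runs of four,
+  // and applies the first cospi_8_64/cospi_24_64 rotations to the 18/29,
+  // 19/28, 20/27 and 21/26 pairs.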
+ al[0] = vaddq_s32(bl[0], bl[3]); + ar[0] = vaddq_s32(br[0], br[3]); + al[1] = vaddq_s32(bl[1], bl[2]); + ar[1] = vaddq_s32(br[1], br[2]); + al[2] = vsubq_s32(bl[1], bl[2]); + ar[2] = vsubq_s32(br[1], br[2]); + al[3] = vsubq_s32(bl[0], bl[3]); + ar[3] = vsubq_s32(br[0], br[3]); + + al[4] = bl[4]; + ar[4] = br[4]; + + butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6], + &ar[6], &al[5], &ar[5]); + + al[7] = bl[7]; + ar[7] = br[7]; + + al[8] = vaddq_s32(bl[8], bl[11]); + ar[8] = vaddq_s32(br[8], br[11]); + al[9] = vaddq_s32(bl[9], bl[10]); + ar[9] = vaddq_s32(br[9], br[10]); + al[10] = vsubq_s32(bl[9], bl[10]); + ar[10] = vsubq_s32(br[9], br[10]); + al[11] = vsubq_s32(bl[8], bl[11]); + ar[11] = vsubq_s32(br[8], br[11]); + al[12] = vsubq_s32(bl[15], bl[12]); + ar[12] = vsubq_s32(br[15], br[12]); + al[13] = vsubq_s32(bl[14], bl[13]); + ar[13] = vsubq_s32(br[14], br[13]); + al[14] = vaddq_s32(bl[14], bl[13]); + ar[14] = vaddq_s32(br[14], br[13]); + al[15] = vaddq_s32(bl[15], bl[12]); + ar[15] = vaddq_s32(br[15], br[12]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[17] = bl[17]; + ar[17] = br[17]; + + butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_8_64, + cospi_24_64, &al[29], &ar[29], &al[18], &ar[18]); + butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_8_64, + cospi_24_64, &al[28], &ar[28], &al[19], &ar[19]); + butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_24_64, + -cospi_8_64, &al[27], &ar[27], &al[20], &ar[20]); + butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_24_64, + -cospi_8_64, &al[26], &ar[26], &al[21], &ar[21]); + + al[22] = bl[22]; + ar[22] = br[22]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[25] = bl[25]; + ar[25] = br[25]; + + al[30] = bl[30]; + ar[30] = br[30]; + al[31] = bl[31]; + ar[31] = br[31]; + + // Stage 5. 
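+  // Stage 5 resolves the DC/row-16 pair with the final cospi_16_64
+  // butterfly and the row-8/row-24 pair with cospi_8_64/cospi_24_64,
+  // butterflies terms 4-7, rotates the 9/14 and 10/13 pairs, and regroups
+  // terms 16-31 in runs of four.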
+ butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0], + &br[0], &bl[1], &br[1]); + butterfly_two_coeff_s32(al[3], ar[3], al[2], ar[2], cospi_8_64, cospi_24_64, + &bl[2], &br[2], &bl[3], &br[3]); + + bl[4] = vaddq_s32(al[4], al[5]); + br[4] = vaddq_s32(ar[4], ar[5]); + bl[5] = vsubq_s32(al[4], al[5]); + br[5] = vsubq_s32(ar[4], ar[5]); + bl[6] = vsubq_s32(al[7], al[6]); + br[6] = vsubq_s32(ar[7], ar[6]); + bl[7] = vaddq_s32(al[7], al[6]); + br[7] = vaddq_s32(ar[7], ar[6]); + + bl[8] = al[8]; + br[8] = ar[8]; + + butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_8_64, cospi_24_64, + &bl[14], &br[14], &bl[9], &br[9]); + butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_24_64, + -cospi_8_64, &bl[13], &br[13], &bl[10], &br[10]); + + bl[11] = al[11]; + br[11] = ar[11]; + bl[12] = al[12]; + br[12] = ar[12]; + + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(al[19], al[16]); + br[16] = vaddq_s32(ar[19], ar[16]); + bl[17] = vaddq_s32(al[18], al[17]); + br[17] = vaddq_s32(ar[18], ar[17]); + bl[18] = vsubq_s32(al[17], al[18]); + br[18] = vsubq_s32(ar[17], ar[18]); + bl[19] = vsubq_s32(al[16], al[19]); + br[19] = vsubq_s32(ar[16], ar[19]); + bl[20] = vsubq_s32(al[23], al[20]); + br[20] = vsubq_s32(ar[23], ar[20]); + bl[21] = vsubq_s32(al[22], al[21]); + br[21] = vsubq_s32(ar[22], ar[21]); + bl[22] = vaddq_s32(al[21], al[22]); + br[22] = vaddq_s32(ar[21], ar[22]); + bl[23] = vaddq_s32(al[20], al[23]); + br[23] = vaddq_s32(ar[20], ar[23]); + bl[24] = vaddq_s32(al[27], al[24]); + br[24] = vaddq_s32(ar[27], ar[24]); + bl[25] = vaddq_s32(al[26], al[25]); + br[25] = vaddq_s32(ar[26], ar[25]); + bl[26] = vsubq_s32(al[25], al[26]); + br[26] = vsubq_s32(ar[25], ar[26]); + bl[27] = vsubq_s32(al[24], al[27]); + br[27] = vsubq_s32(ar[24], ar[27]); + bl[28] = vsubq_s32(al[31], al[28]); + br[28] = vsubq_s32(ar[31], ar[28]); + bl[29] = vsubq_s32(al[30], al[29]); + br[29] = vsubq_s32(ar[30], ar[29]); + bl[30] = vaddq_s32(al[29], al[30]); + br[30] = vaddq_s32(ar[29], ar[30]); + bl[31] = vaddq_s32(al[28], al[31]); + br[31] = vaddq_s32(ar[28], ar[31]); + + // Stage 6. 
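+  // Stage 6 completes even rows 4, 20, 12 and 28 via the
+  // cospi_4_64/cospi_28_64 and cospi_20_64/cospi_12_64 rotations,
+  // butterflies terms 8-15 in adjacent pairs, and rotates the 17/30, 18/29,
+  // 21/26 and 22/25 pairs.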
+ al[0] = bl[0]; + ar[0] = br[0]; + al[1] = bl[1]; + ar[1] = br[1]; + al[2] = bl[2]; + ar[2] = br[2]; + al[3] = bl[3]; + ar[3] = br[3]; + + butterfly_two_coeff_s32(bl[7], br[7], bl[4], br[4], cospi_4_64, cospi_28_64, + &al[4], &ar[4], &al[7], &ar[7]); + butterfly_two_coeff_s32(bl[6], br[6], bl[5], br[5], cospi_20_64, cospi_12_64, + &al[5], &ar[5], &al[6], &ar[6]); + + al[8] = vaddq_s32(bl[8], bl[9]); + ar[8] = vaddq_s32(br[8], br[9]); + al[9] = vsubq_s32(bl[8], bl[9]); + ar[9] = vsubq_s32(br[8], br[9]); + al[10] = vsubq_s32(bl[11], bl[10]); + ar[10] = vsubq_s32(br[11], br[10]); + al[11] = vaddq_s32(bl[11], bl[10]); + ar[11] = vaddq_s32(br[11], br[10]); + al[12] = vaddq_s32(bl[12], bl[13]); + ar[12] = vaddq_s32(br[12], br[13]); + al[13] = vsubq_s32(bl[12], bl[13]); + ar[13] = vsubq_s32(br[12], br[13]); + al[14] = vsubq_s32(bl[15], bl[14]); + ar[14] = vsubq_s32(br[15], br[14]); + al[15] = vaddq_s32(bl[15], bl[14]); + ar[15] = vaddq_s32(br[15], br[14]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[19] = bl[19]; + ar[19] = br[19]; + al[20] = bl[20]; + ar[20] = br[20]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[27] = bl[27]; + ar[27] = br[27]; + al[28] = bl[28]; + ar[28] = br[28]; + al[31] = bl[31]; + ar[31] = br[31]; + + butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_4_64, + cospi_28_64, &al[30], &ar[30], &al[17], &ar[17]); + butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_28_64, + -cospi_4_64, &al[29], &ar[29], &al[18], &ar[18]); + butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_20_64, + cospi_12_64, &al[26], &ar[26], &al[21], &ar[21]); + butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_12_64, + -cospi_20_64, &al[25], &ar[25], &al[22], &ar[22]); + + // Stage 7. 
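+  // Stage 7 applies the last even-half rotations, cospi_2_64/cospi_30_64
+  // through cospi_26_64/cospi_6_64, to the 8/15, 9/14, 10/13 and 11/12
+  // pairs (even rows 2-30), and butterflies terms 16-31 in adjacent pairs
+  // ahead of the final odd-row rotations.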
+ bl[0] = al[0]; + br[0] = ar[0]; + bl[1] = al[1]; + br[1] = ar[1]; + bl[2] = al[2]; + br[2] = ar[2]; + bl[3] = al[3]; + br[3] = ar[3]; + bl[4] = al[4]; + br[4] = ar[4]; + bl[5] = al[5]; + br[5] = ar[5]; + bl[6] = al[6]; + br[6] = ar[6]; + bl[7] = al[7]; + br[7] = ar[7]; + + butterfly_two_coeff_s32(al[15], ar[15], al[8], ar[8], cospi_2_64, cospi_30_64, + &bl[8], &br[8], &bl[15], &br[15]); + butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_18_64, + cospi_14_64, &bl[9], &br[9], &bl[14], &br[14]); + butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_10_64, + cospi_22_64, &bl[10], &br[10], &bl[13], &br[13]); + butterfly_two_coeff_s32(al[12], ar[12], al[11], ar[11], cospi_26_64, + cospi_6_64, &bl[11], &br[11], &bl[12], &br[12]); + + bl[16] = vaddq_s32(al[16], al[17]); + br[16] = vaddq_s32(ar[16], ar[17]); + bl[17] = vsubq_s32(al[16], al[17]); + br[17] = vsubq_s32(ar[16], ar[17]); + bl[18] = vsubq_s32(al[19], al[18]); + br[18] = vsubq_s32(ar[19], ar[18]); + bl[19] = vaddq_s32(al[19], al[18]); + br[19] = vaddq_s32(ar[19], ar[18]); + bl[20] = vaddq_s32(al[20], al[21]); + br[20] = vaddq_s32(ar[20], ar[21]); + bl[21] = vsubq_s32(al[20], al[21]); + br[21] = vsubq_s32(ar[20], ar[21]); + bl[22] = vsubq_s32(al[23], al[22]); + br[22] = vsubq_s32(ar[23], ar[22]); + bl[23] = vaddq_s32(al[23], al[22]); + br[23] = vaddq_s32(ar[23], ar[22]); + bl[24] = vaddq_s32(al[24], al[25]); + br[24] = vaddq_s32(ar[24], ar[25]); + bl[25] = vsubq_s32(al[24], al[25]); + br[25] = vsubq_s32(ar[24], ar[25]); + bl[26] = vsubq_s32(al[27], al[26]); + br[26] = vsubq_s32(ar[27], ar[26]); + bl[27] = vaddq_s32(al[27], al[26]); + br[27] = vaddq_s32(ar[27], ar[26]); + bl[28] = vaddq_s32(al[28], al[29]); + br[28] = vaddq_s32(ar[28], ar[29]); + bl[29] = vsubq_s32(al[28], al[29]); + br[29] = vsubq_s32(ar[28], ar[29]); + bl[30] = vsubq_s32(al[31], al[30]); + br[30] = vsubq_s32(ar[31], ar[30]); + bl[31] = vaddq_s32(al[31], al[30]); + br[31] = vaddq_s32(ar[31], ar[30]); + + // Final stage. 
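+  // The final stage writes the even half out in bit-reversed order
+  // (bl[k] maps to row 2 * bitrev4(k), e.g. bl[1] -> row 16, bl[2] -> row 8)
+  // and derives the odd rows 1, 3, ..., 31 from the cospi_1_64 ...
+  // cospi_31_64 rotations below.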
+ left[0] = bl[0]; + right[0] = br[0]; + left[16] = bl[1]; + right[16] = br[1]; + left[8] = bl[2]; + right[8] = br[2]; + left[24] = bl[3]; + right[24] = br[3]; + left[4] = bl[4]; + right[4] = br[4]; + left[20] = bl[5]; + right[20] = br[5]; + left[12] = bl[6]; + right[12] = br[6]; + left[28] = bl[7]; + right[28] = br[7]; + left[2] = bl[8]; + right[2] = br[8]; + left[18] = bl[9]; + right[18] = br[9]; + left[10] = bl[10]; + right[10] = br[10]; + left[26] = bl[11]; + right[26] = br[11]; + left[6] = bl[12]; + right[6] = br[12]; + left[22] = bl[13]; + right[22] = br[13]; + left[14] = bl[14]; + right[14] = br[14]; + left[30] = bl[15]; + right[30] = br[15]; + + butterfly_two_coeff_s32(bl[31], br[31], bl[16], br[16], cospi_1_64, + cospi_31_64, &al[1], &ar[1], &al[31], &ar[31]); + left[1] = al[1]; + right[1] = ar[1]; + left[31] = al[31]; + right[31] = ar[31]; + + butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_17_64, + cospi_15_64, &al[17], &ar[17], &al[15], &ar[15]); + left[17] = al[17]; + right[17] = ar[17]; + left[15] = al[15]; + right[15] = ar[15]; + + butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_9_64, + cospi_23_64, &al[9], &ar[9], &al[23], &ar[23]); + left[9] = al[9]; + right[9] = ar[9]; + left[23] = al[23]; + right[23] = ar[23]; + + butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_25_64, + cospi_7_64, &al[25], &ar[25], &al[7], &ar[7]); + left[25] = al[25]; + right[25] = ar[25]; + left[7] = al[7]; + right[7] = ar[7]; + + butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_5_64, + cospi_27_64, &al[5], &ar[5], &al[27], &ar[27]); + left[5] = al[5]; + right[5] = ar[5]; + left[27] = al[27]; + right[27] = ar[27]; + + butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_21_64, + cospi_11_64, &al[21], &ar[21], &al[11], &ar[11]); + left[21] = al[21]; + right[21] = ar[21]; + left[11] = al[11]; + right[11] = ar[11]; + + butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_13_64, + cospi_19_64, &al[13], &ar[13], &al[19], &ar[19]); + left[13] = al[13]; + right[13] = ar[13]; + left[19] = al[19]; + right[19] = ar[19]; + + butterfly_two_coeff_s32(bl[24], br[24], bl[23], br[23], cospi_29_64, + cospi_3_64, &al[29], &ar[29], &al[3], &ar[3]); + left[29] = al[29]; + right[29] = ar[29]; + left[3] = al[3]; + right[3] = ar[3]; +} + +#endif // CONFIG_VP9_HIGHBITDEPTH + +#endif // VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_ diff --git a/libvpx/vpx_dsp/arm/fdct_neon.c b/libvpx/vpx_dsp/arm/fdct4x4_neon.c index 2827791f1..3b9196fae 100644 --- a/libvpx/vpx_dsp/arm/fdct_neon.c +++ b/libvpx/vpx_dsp/arm/fdct4x4_neon.c @@ -18,10 +18,10 @@ #include "vpx_dsp/arm/fdct_neon.h" #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/fdct4x4_neon.h" void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, int stride) { - int i; // input[M * stride] * 16 int16x4_t in[4]; in[0] = vshl_n_s16(vld1_s16(input + 0 * stride), 4); @@ -34,9 +34,8 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1)); in[0] = vadd_s16(in[0], one); } - for (i = 0; i < 2; ++i) { - vpx_fdct4x4_pass1_neon(in); - } + vpx_fdct4x4_pass1_neon(in); + vpx_fdct4x4_pass2_neon(in); { // Not quite a rounding shift. Only add 1 despite shifting by 2. 
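    // A true rounding shift by 2 would add 2 before shifting; adding only 1
    // biases exact halves downward, e.g. 6 becomes (6 + 1) >> 2 = 1 rather
    // than 2, presumably to stay bit-exact with the scalar vpx_fdct4x4().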
const int16x8_t one = vdupq_n_s16(1); @@ -48,3 +47,39 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, store_s16q_to_tran_low(final_output + 1 * 8, out_23); } } + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, + int stride) { + static const int32x4_t const_1000 = { 1, 0, 0, 0 }; + const int32x4_t const_one = vdupq_n_s32(1); + + // input[M * stride] * 16 + int32x4_t in[4]; + in[0] = vshll_n_s16(vld1_s16(input + 0 * stride), 4); + in[1] = vshll_n_s16(vld1_s16(input + 1 * stride), 4); + in[2] = vshll_n_s16(vld1_s16(input + 2 * stride), 4); + in[3] = vshll_n_s16(vld1_s16(input + 3 * stride), 4); + + // If the very first value != 0, then add 1. + if (input[0] != 0) { + in[0] = vaddq_s32(in[0], const_1000); + } + + vpx_highbd_fdct4x4_pass1_neon(in); + vpx_highbd_fdct4x4_pass1_neon(in); + { + // Not quite a rounding shift. Only add 1 despite shifting by 2. + in[0] = vshrq_n_s32(vaddq_s32(in[0], const_one), 2); + in[1] = vshrq_n_s32(vaddq_s32(in[1], const_one), 2); + in[2] = vshrq_n_s32(vaddq_s32(in[2], const_one), 2); + in[3] = vshrq_n_s32(vaddq_s32(in[3], const_one), 2); + + vst1q_s32(final_output, in[0]); + vst1q_s32(final_output + 4, in[1]); + vst1q_s32(final_output + 8, in[2]); + vst1q_s32(final_output + 12, in[3]); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libvpx/vpx_dsp/arm/fdct4x4_neon.h b/libvpx/vpx_dsp/arm/fdct4x4_neon.h new file mode 100644 index 000000000..de3db9774 --- /dev/null +++ b/libvpx/vpx_dsp/arm/fdct4x4_neon.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_ +#define VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_ + +#include <arm_neon.h> + +static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) { + int16x4_t out[4]; + + const int16x8_t input_01 = vcombine_s16(in[0], in[1]); + const int16x8_t input_32 = vcombine_s16(in[3], in[2]); + + // in_0 +/- in_3, in_1 +/- in_2 + const int16x8_t s_01 = vaddq_s16(input_01, input_32); + const int16x8_t s_32 = vsubq_s16(input_01, input_32); + + // step_0 +/- step_1, step_2 +/- step_3 + const int16x4_t s_0 = vget_low_s16(s_01); + const int16x4_t s_1 = vget_high_s16(s_01); + const int16x4_t s_2 = vget_high_s16(s_32); + const int16x4_t s_3 = vget_low_s16(s_32); + + // fdct_round_shift(s_0 +/- s_1) * cospi_16_64 + butterfly_one_coeff_s16_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]); + + // s_3 * cospi_8_64 + s_2 * cospi_24_64 + // s_3 * cospi_24_64 - s_2 * cospi_8_64 + butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]); + + transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]); + + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; +} + +static INLINE void vpx_fdct4x4_pass2_neon(int16x4_t *in) { + int16x4_t out[4]; + + const int16x8_t input_01 = vcombine_s16(in[0], in[1]); + const int16x8_t input_32 = vcombine_s16(in[3], in[2]); + + // in_0 +/- in_3, in_1 +/- in_2 + const int16x8_t s_01 = vaddq_s16(input_01, input_32); + const int16x8_t s_32 = vsubq_s16(input_01, input_32); + + // step_0 +/- step_1, step_2 +/- step_3 + const int16x4_t s_0 = vget_low_s16(s_01); + const int16x4_t s_1 = vget_high_s16(s_01); + const int16x4_t s_2 = vget_high_s16(s_32); + const int16x4_t s_3 = vget_low_s16(s_32); + + // fdct_round_shift(s_0 +/- s_1) * cospi_16_64 + butterfly_one_coeff_s16_s32_fast_narrow_half(s_0, s_1, cospi_16_64, &out[0], + &out[2]); + + // s_3 * cospi_8_64 + s_2 * cospi_24_64 + // s_3 * cospi_24_64 - s_2 * cospi_8_64 + butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]); + + transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]); + + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; +} + +#if CONFIG_VP9_HIGHBITDEPTH + +static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) { + int32x4_t out[4]; + // in_0 +/- in_3, in_1 +/- in_2 + const int32x4_t s_0 = vaddq_s32(in[0], in[3]); + const int32x4_t s_1 = vaddq_s32(in[1], in[2]); + const int32x4_t s_2 = vsubq_s32(in[1], in[2]); + const int32x4_t s_3 = vsubq_s32(in[0], in[3]); + + butterfly_one_coeff_s32_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]); + + // out[1] = s_3 * cospi_8_64 + s_2 * cospi_24_64 + // out[3] = s_3 * cospi_24_64 - s_2 * cospi_8_64 + butterfly_two_coeff_s32_s64_narrow_half(s_3, s_2, cospi_8_64, cospi_24_64, + &out[1], &out[3]); + + transpose_s32_4x4(&out[0], &out[1], &out[2], &out[3]); + + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; +} + +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_ diff --git a/libvpx/vpx_dsp/arm/fdct8x8_neon.c b/libvpx/vpx_dsp/arm/fdct8x8_neon.c new file mode 100644 index 000000000..75ee6f223 --- /dev/null +++ b/libvpx/vpx_dsp/arm/fdct8x8_neon.c @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/fdct8x8_neon.h"
+
+void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
+                      int stride) {
+  // stage 1
+  int16x8_t in[8];
+  in[0] = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
+  in[1] = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
+  in[2] = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
+  in[3] = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
+  in[4] = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
+  in[5] = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
+  in[6] = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
+  in[7] = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
+
+  vpx_fdct8x8_pass1_neon(in);
+  vpx_fdct8x8_pass2_neon(in);
+  {
+    // from vpx_dct_sse2.c
+    // Post-condition (division by two)
+    // division of a 16-bit signed number by two using shifts
+    // n / 2 = (n - (n >> 15)) >> 1
+    const int16x8_t sign_in0 = vshrq_n_s16(in[0], 15);
+    const int16x8_t sign_in1 = vshrq_n_s16(in[1], 15);
+    const int16x8_t sign_in2 = vshrq_n_s16(in[2], 15);
+    const int16x8_t sign_in3 = vshrq_n_s16(in[3], 15);
+    const int16x8_t sign_in4 = vshrq_n_s16(in[4], 15);
+    const int16x8_t sign_in5 = vshrq_n_s16(in[5], 15);
+    const int16x8_t sign_in6 = vshrq_n_s16(in[6], 15);
+    const int16x8_t sign_in7 = vshrq_n_s16(in[7], 15);
+    in[0] = vhsubq_s16(in[0], sign_in0);
+    in[1] = vhsubq_s16(in[1], sign_in1);
+    in[2] = vhsubq_s16(in[2], sign_in2);
+    in[3] = vhsubq_s16(in[3], sign_in3);
+    in[4] = vhsubq_s16(in[4], sign_in4);
+    in[5] = vhsubq_s16(in[5], sign_in5);
+    in[6] = vhsubq_s16(in[6], sign_in6);
+    in[7] = vhsubq_s16(in[7], sign_in7);
+    // store results
+    store_s16q_to_tran_low(final_output + 0 * 8, in[0]);
+    store_s16q_to_tran_low(final_output + 1 * 8, in[1]);
+    store_s16q_to_tran_low(final_output + 2 * 8, in[2]);
+    store_s16q_to_tran_low(final_output + 3 * 8, in[3]);
+    store_s16q_to_tran_low(final_output + 4 * 8, in[4]);
+    store_s16q_to_tran_low(final_output + 5 * 8, in[5]);
+    store_s16q_to_tran_low(final_output + 6 * 8, in[6]);
+    store_s16q_to_tran_low(final_output + 7 * 8, in[7]);
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
+                             int stride) {
+  // input[M * stride] * 4
+  int32x4_t left[8], right[8];
+  int16x8_t in[8];
+  in[0] = vld1q_s16(input + 0 * stride);
+  in[1] = vld1q_s16(input + 1 * stride);
+  in[2] = vld1q_s16(input + 2 * stride);
+  in[3] = vld1q_s16(input + 3 * stride);
+  in[4] = vld1q_s16(input + 4 * stride);
+  in[5] = vld1q_s16(input + 5 * stride);
+  in[6] = vld1q_s16(input + 6 * stride);
+  in[7] = vld1q_s16(input + 7 * stride);
+
+  left[0] = vshll_n_s16(vget_low_s16(in[0]), 2);
+  left[1] = vshll_n_s16(vget_low_s16(in[1]), 2);
+  left[2] = vshll_n_s16(vget_low_s16(in[2]), 2);
+  left[3] = vshll_n_s16(vget_low_s16(in[3]), 2);
+  left[4] = vshll_n_s16(vget_low_s16(in[4]), 2);
+  left[5] = vshll_n_s16(vget_low_s16(in[5]), 2);
+  left[6] = vshll_n_s16(vget_low_s16(in[6]), 2);
+  left[7] = vshll_n_s16(vget_low_s16(in[7]), 2);
+  right[0] = vshll_n_s16(vget_high_s16(in[0]), 2);
+  right[1] = vshll_n_s16(vget_high_s16(in[1]), 2);
+  right[2] = vshll_n_s16(vget_high_s16(in[2]), 2);
+  right[3] = vshll_n_s16(vget_high_s16(in[3]), 2);
+  right[4] =
vshll_n_s16(vget_high_s16(in[4]), 2); + right[5] = vshll_n_s16(vget_high_s16(in[5]), 2); + right[6] = vshll_n_s16(vget_high_s16(in[6]), 2); + right[7] = vshll_n_s16(vget_high_s16(in[7]), 2); + + vpx_highbd_fdct8x8_pass1_neon(left, right); + vpx_highbd_fdct8x8_pass2_neon(left, right); + { + left[0] = add_round_shift_half_s32(left[0]); + left[1] = add_round_shift_half_s32(left[1]); + left[2] = add_round_shift_half_s32(left[2]); + left[3] = add_round_shift_half_s32(left[3]); + left[4] = add_round_shift_half_s32(left[4]); + left[5] = add_round_shift_half_s32(left[5]); + left[6] = add_round_shift_half_s32(left[6]); + left[7] = add_round_shift_half_s32(left[7]); + right[0] = add_round_shift_half_s32(right[0]); + right[1] = add_round_shift_half_s32(right[1]); + right[2] = add_round_shift_half_s32(right[2]); + right[3] = add_round_shift_half_s32(right[3]); + right[4] = add_round_shift_half_s32(right[4]); + right[5] = add_round_shift_half_s32(right[5]); + right[6] = add_round_shift_half_s32(right[6]); + right[7] = add_round_shift_half_s32(right[7]); + + // store results + vst1q_s32(final_output, left[0]); + vst1q_s32(final_output + 4, right[0]); + vst1q_s32(final_output + 8, left[1]); + vst1q_s32(final_output + 12, right[1]); + vst1q_s32(final_output + 16, left[2]); + vst1q_s32(final_output + 20, right[2]); + vst1q_s32(final_output + 24, left[3]); + vst1q_s32(final_output + 28, right[3]); + vst1q_s32(final_output + 32, left[4]); + vst1q_s32(final_output + 36, right[4]); + vst1q_s32(final_output + 40, left[5]); + vst1q_s32(final_output + 44, right[5]); + vst1q_s32(final_output + 48, left[6]); + vst1q_s32(final_output + 52, right[6]); + vst1q_s32(final_output + 56, left[7]); + vst1q_s32(final_output + 60, right[7]); + } +} + +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libvpx/vpx_dsp/arm/fdct8x8_neon.h b/libvpx/vpx_dsp/arm/fdct8x8_neon.h new file mode 100644 index 000000000..d8fa60044 --- /dev/null +++ b/libvpx/vpx_dsp/arm/fdct8x8_neon.h @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_ +#define VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_ + +#include <arm_neon.h> + +static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in, + int16x8_t *out) { + int16x8_t s[8], x[4], t[2]; + + s[0] = vaddq_s16(in[0], in[7]); + s[1] = vaddq_s16(in[1], in[6]); + s[2] = vaddq_s16(in[2], in[5]); + s[3] = vaddq_s16(in[3], in[4]); + s[4] = vsubq_s16(in[3], in[4]); + s[5] = vsubq_s16(in[2], in[5]); + s[6] = vsubq_s16(in[1], in[6]); + s[7] = vsubq_s16(in[0], in[7]); + // fdct4(step, step); + x[0] = vaddq_s16(s[0], s[3]); + x[1] = vaddq_s16(s[1], s[2]); + x[2] = vsubq_s16(s[1], s[2]); + x[3] = vsubq_s16(s[0], s[3]); + + // fdct4(step, step); + // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64) + // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s16_fast(x[0], x[1], cospi_16_64, &out[0], &out[4]); + // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64) + // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64) + butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]); + + // Stage 2 + // t0 = (s6 - s5) * cospi_16_64; + // t1 = (s6 + s5) * cospi_16_64; + butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &t[1], &t[0]); + + // Stage 3 + x[0] = vaddq_s16(s[4], t[0]); + x[1] = vsubq_s16(s[4], t[0]); + x[2] = vsubq_s16(s[7], t[1]); + x[3] = vaddq_s16(s[7], t[1]); + + // Stage 4 + // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]); + + // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]); +} + +static INLINE void vpx_fdct8x8_pass2_notranspose_neon(int16x8_t *in, + int16x8_t *out) { + int16x8_t s[8], x[4], t[2]; + + s[0] = vaddq_s16(in[0], in[7]); + s[1] = vaddq_s16(in[1], in[6]); + s[2] = vaddq_s16(in[2], in[5]); + s[3] = vaddq_s16(in[3], in[4]); + s[4] = vsubq_s16(in[3], in[4]); + s[5] = vsubq_s16(in[2], in[5]); + s[6] = vsubq_s16(in[1], in[6]); + s[7] = vsubq_s16(in[0], in[7]); + // fdct4(step, step); + x[0] = vaddq_s16(s[0], s[3]); + x[1] = vaddq_s16(s[1], s[2]); + x[2] = vsubq_s16(s[1], s[2]); + x[3] = vsubq_s16(s[0], s[3]); + + // fdct4(step, step); + // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64) + // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0], + &out[4]); + // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64) + // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64) + butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]); + + // Stage 2 + // t0 = (s6 - s5) * cospi_16_64; + // t1 = (s6 + s5) * cospi_16_64; + butterfly_one_coeff_s16_s32_fast_narrow(s[6], s[5], cospi_16_64, &t[1], + &t[0]); + + // Stage 3 + x[0] = vaddq_s16(s[4], t[0]); + x[1] = vsubq_s16(s[4], t[0]); + x[2] = vsubq_s16(s[7], t[1]); + x[3] = vaddq_s16(s[7], t[1]); + + // Stage 4 + // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]); + + // out[5] = 
(tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]); +} + +static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) { + int16x8_t out[8]; + vpx_fdct8x8_pass1_notranspose_neon(in, out); + // transpose 8x8 + transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; + in[4] = out[4]; + in[5] = out[5]; + in[6] = out[6]; + in[7] = out[7]; +} + +static INLINE void vpx_fdct8x8_pass2_neon(int16x8_t *in) { + int16x8_t out[8]; + vpx_fdct8x8_pass2_notranspose_neon(in, out); + // transpose 8x8 + transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; + in[4] = out[4]; + in[5] = out[5]; + in[6] = out[6]; + in[7] = out[7]; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left, + int32x4_t *right) { + int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4]; + + sl[0] = vaddq_s32(left[0], left[7]); + sl[1] = vaddq_s32(left[1], left[6]); + sl[2] = vaddq_s32(left[2], left[5]); + sl[3] = vaddq_s32(left[3], left[4]); + sl[4] = vsubq_s32(left[3], left[4]); + sl[5] = vsubq_s32(left[2], left[5]); + sl[6] = vsubq_s32(left[1], left[6]); + sl[7] = vsubq_s32(left[0], left[7]); + sr[0] = vaddq_s32(right[0], right[7]); + sr[1] = vaddq_s32(right[1], right[6]); + sr[2] = vaddq_s32(right[2], right[5]); + sr[3] = vaddq_s32(right[3], right[4]); + sr[4] = vsubq_s32(right[3], right[4]); + sr[5] = vsubq_s32(right[2], right[5]); + sr[6] = vsubq_s32(right[1], right[6]); + sr[7] = vsubq_s32(right[0], right[7]); + + // fdct4(step, step); + // x0 = s0 + s3; + xl[0] = vaddq_s32(sl[0], sl[3]); + xr[0] = vaddq_s32(sr[0], sr[3]); + // x1 = s1 + s2; + xl[1] = vaddq_s32(sl[1], sl[2]); + xr[1] = vaddq_s32(sr[1], sr[2]); + // x2 = s1 - s2; + xl[2] = vsubq_s32(sl[1], sl[2]); + xr[2] = vsubq_s32(sr[1], sr[2]); + // x3 = s0 - s3; + xl[3] = vsubq_s32(sl[0], sl[3]); + xr[3] = vsubq_s32(sr[0], sr[3]); + + // fdct4(step, step); + // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64) + // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64, + &left[0], &right[0], &left[4], &right[4]); + // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64) + // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64) + butterfly_two_coeff_s32(xl[3], xr[3], xl[2], xr[2], cospi_8_64, cospi_24_64, + &left[2], &right[2], &left[6], &right[6]); + + // Stage 2 + // t0 = (s6 - s5) * cospi_16_64; + // t1 = (s6 + s5) * cospi_16_64; + butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1], + &tr[1], &tl[0], &tr[0]); + + // Stage 3 + xl[0] = vaddq_s32(sl[4], tl[0]); + xr[0] = vaddq_s32(sr[4], tr[0]); + xl[1] = vsubq_s32(sl[4], tl[0]); + xr[1] = vsubq_s32(sr[4], tr[0]); + xl[2] = vsubq_s32(sl[7], tl[1]); + xr[2] = vsubq_s32(sr[7], tr[1]); + xl[3] = vaddq_s32(sl[7], tl[1]); + xr[3] = vaddq_s32(sr[7], tr[1]); + + // Stage 4 + // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff_s32(xl[3], xr[3], xl[0], xr[0], cospi_4_64, cospi_28_64, + &left[1], &right[1], &left[7], 
&right[7]); + + // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff_s32(xl[2], xr[2], xl[1], xr[1], cospi_20_64, cospi_12_64, + &left[5], &right[5], &left[3], &right[3]); +} + +static INLINE void vpx_highbd_fdct8x8_pass2_notranspose_neon(int32x4_t *left, + int32x4_t *right) { + int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4]; + + sl[0] = vaddq_s32(left[0], left[7]); + sl[1] = vaddq_s32(left[1], left[6]); + sl[2] = vaddq_s32(left[2], left[5]); + sl[3] = vaddq_s32(left[3], left[4]); + sl[4] = vsubq_s32(left[3], left[4]); + sl[5] = vsubq_s32(left[2], left[5]); + sl[6] = vsubq_s32(left[1], left[6]); + sl[7] = vsubq_s32(left[0], left[7]); + sr[0] = vaddq_s32(right[0], right[7]); + sr[1] = vaddq_s32(right[1], right[6]); + sr[2] = vaddq_s32(right[2], right[5]); + sr[3] = vaddq_s32(right[3], right[4]); + sr[4] = vsubq_s32(right[3], right[4]); + sr[5] = vsubq_s32(right[2], right[5]); + sr[6] = vsubq_s32(right[1], right[6]); + sr[7] = vsubq_s32(right[0], right[7]); + + // fdct4(step, step); + // x0 = s0 + s3; + xl[0] = vaddq_s32(sl[0], sl[3]); + xr[0] = vaddq_s32(sr[0], sr[3]); + // x1 = s1 + s2; + xl[1] = vaddq_s32(sl[1], sl[2]); + xr[1] = vaddq_s32(sr[1], sr[2]); + // x2 = s1 - s2; + xl[2] = vsubq_s32(sl[1], sl[2]); + xr[2] = vsubq_s32(sr[1], sr[2]); + // x3 = s0 - s3; + xl[3] = vsubq_s32(sl[0], sl[3]); + xr[3] = vsubq_s32(sr[0], sr[3]); + + // fdct4(step, step); + // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64) + // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64, + &left[0], &right[0], &left[4], &right[4]); + // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64) + // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64) + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64, + cospi_24_64, &left[2], &right[2], &left[6], + &right[6]); + + // Stage 2 + // t0 = (s6 - s5) * cospi_16_64; + // t1 = (s6 + s5) * cospi_16_64; + butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1], + &tr[1], &tl[0], &tr[0]); + + // Stage 3 + xl[0] = vaddq_s32(sl[4], tl[0]); + xr[0] = vaddq_s32(sr[4], tr[0]); + xl[1] = vsubq_s32(sl[4], tl[0]); + xr[1] = vsubq_s32(sr[4], tr[0]); + xl[2] = vsubq_s32(sl[7], tl[1]); + xr[2] = vsubq_s32(sr[7], tr[1]); + xl[3] = vaddq_s32(sl[7], tl[1]); + xr[3] = vaddq_s32(sr[7], tr[1]); + + // Stage 4 + // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64, + cospi_28_64, &left[1], &right[1], &left[7], + &right[7]); + + // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64, + cospi_12_64, &left[5], &right[5], &left[3], + &right[3]); +} + +static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left, + int32x4_t *right) { + int32x4x2_t out[8]; + vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right); + + out[0].val[0] = left[0]; + out[0].val[1] = right[0]; + out[1].val[0] = left[1]; + out[1].val[1] = right[1]; + out[2].val[0] = left[2]; + out[2].val[1] = right[2]; + out[3].val[0] = left[3]; + out[3].val[1] = right[3]; 
+ out[4].val[0] = left[4]; + out[4].val[1] = right[4]; + out[5].val[0] = left[5]; + out[5].val[1] = right[5]; + out[6].val[0] = left[6]; + out[6].val[1] = right[6]; + out[7].val[0] = left[7]; + out[7].val[1] = right[7]; + + transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + + left[0] = out[0].val[0]; + right[0] = out[0].val[1]; + left[1] = out[1].val[0]; + right[1] = out[1].val[1]; + left[2] = out[2].val[0]; + right[2] = out[2].val[1]; + left[3] = out[3].val[0]; + right[3] = out[3].val[1]; + left[4] = out[4].val[0]; + right[4] = out[4].val[1]; + left[5] = out[5].val[0]; + right[5] = out[5].val[1]; + left[6] = out[6].val[0]; + right[6] = out[6].val[1]; + left[7] = out[7].val[0]; + right[7] = out[7].val[1]; +} + +static INLINE void vpx_highbd_fdct8x8_pass2_neon(int32x4_t *left, + int32x4_t *right) { + int32x4x2_t out[8]; + vpx_highbd_fdct8x8_pass2_notranspose_neon(left, right); + + out[0].val[0] = left[0]; + out[0].val[1] = right[0]; + out[1].val[0] = left[1]; + out[1].val[1] = right[1]; + out[2].val[0] = left[2]; + out[2].val[1] = right[2]; + out[3].val[0] = left[3]; + out[3].val[1] = right[3]; + out[4].val[0] = left[4]; + out[4].val[1] = right[4]; + out[5].val[0] = left[5]; + out[5].val[1] = right[5]; + out[6].val[0] = left[6]; + out[6].val[1] = right[6]; + out[7].val[0] = left[7]; + out[7].val[1] = right[7]; + + transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + + left[0] = out[0].val[0]; + right[0] = out[0].val[1]; + left[1] = out[1].val[0]; + right[1] = out[1].val[1]; + left[2] = out[2].val[0]; + right[2] = out[2].val[1]; + left[3] = out[3].val[0]; + right[3] = out[3].val[1]; + left[4] = out[4].val[0]; + right[4] = out[4].val[1]; + left[5] = out[5].val[0]; + right[5] = out[5].val[1]; + left[6] = out[6].val[0]; + right[6] = out[6].val[1]; + left[7] = out[7].val[0]; + right[7] = out[7].val[1]; +} + +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_ diff --git a/libvpx/vpx_dsp/arm/fdct_neon.h b/libvpx/vpx_dsp/arm/fdct_neon.h index 28d7d86bf..193594e3d 100644 --- a/libvpx/vpx_dsp/arm/fdct_neon.h +++ b/libvpx/vpx_dsp/arm/fdct_neon.h @@ -13,201 +13,411 @@ #include <arm_neon.h> -static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) { - const int16x8_t input_01 = vcombine_s16(in[0], in[1]); - const int16x8_t input_32 = vcombine_s16(in[3], in[2]); - - // in_0 +/- in_3, in_1 +/- in_2 - const int16x8_t s_01 = vaddq_s16(input_01, input_32); - const int16x8_t s_32 = vsubq_s16(input_01, input_32); - - // step_0 +/- step_1, step_2 +/- step_3 - const int16x4_t s_0 = vget_low_s16(s_01); - const int16x4_t s_1 = vget_high_s16(s_01); - const int16x4_t s_2 = vget_high_s16(s_32); - const int16x4_t s_3 = vget_low_s16(s_32); - - // (s_0 +/- s_1) * cospi_16_64 - // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c. 
- const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1); - const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1); - const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64); - const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64); - - // fdct_round_shift - int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS); - int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS); - - // s_3 * cospi_8_64 + s_2 * cospi_24_64 - // s_3 * cospi_24_64 - s_2 * cospi_8_64 - const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64); - const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64); - - const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64); - const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64); - - // fdct_round_shift - int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS); - int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS); - - transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3); - - in[0] = out_0; - in[1] = out_1; - in[2] = out_2; - in[3] = out_3; -} - -static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in, - int16x8_t *out) { - const int16x8_t v_s0 = vaddq_s16(in[0], in[7]); - const int16x8_t v_s1 = vaddq_s16(in[1], in[6]); - const int16x8_t v_s2 = vaddq_s16(in[2], in[5]); - const int16x8_t v_s3 = vaddq_s16(in[3], in[4]); - const int16x8_t v_s4 = vsubq_s16(in[3], in[4]); - const int16x8_t v_s5 = vsubq_s16(in[2], in[5]); - const int16x8_t v_s6 = vsubq_s16(in[1], in[6]); - const int16x8_t v_s7 = vsubq_s16(in[0], in[7]); - // fdct4(step, step); - int16x8_t v_x0 = vaddq_s16(v_s0, v_s3); - int16x8_t v_x1 = vaddq_s16(v_s1, v_s2); - int16x8_t v_x2 = vsubq_s16(v_s1, v_s2); - int16x8_t v_x3 = vsubq_s16(v_s0, v_s3); - // fdct4(step, step); - int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); - int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); - int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); - int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); - int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64); - int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64); - int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64); - int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64); - v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64); - v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64); - v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64); - v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64); - v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64); - v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64); - v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64); - v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64); - { - const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); - const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); - const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); - const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); - const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); - const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); - const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); - const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); - out[0] = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43 - out[2] = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63 - out[4] = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47 - out[6] = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67 - } - // Stage 2 - v_x0 = vsubq_s16(v_s6, v_s5); - v_x1 = vaddq_s16(v_s6, v_s5); - v_t0_lo = 
vmull_n_s16(vget_low_s16(v_x0), cospi_16_64); - v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64); - v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64); - v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64); - { - const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); - const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); - const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); - const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); - const int16x8_t ab = vcombine_s16(a, b); - const int16x8_t cd = vcombine_s16(c, d); - // Stage 3 - v_x0 = vaddq_s16(v_s4, ab); - v_x1 = vsubq_s16(v_s4, ab); - v_x2 = vsubq_s16(v_s7, cd); - v_x3 = vaddq_s16(v_s7, cd); - } - // Stage 4 - v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64); - v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64); - v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64); - v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64); - v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64); - v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64); - v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64); - v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64); - v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64); - v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64); - v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64); - v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64); - v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64); - v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64); - v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64); - v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64); - { - const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); - const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); - const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); - const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); - const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); - const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); - const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); - const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); - out[1] = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53 - out[3] = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73 - out[5] = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57 - out[7] = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77 - } -} - -static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) { - int16x8_t out[8]; - vpx_fdct8x8_pass1_notranspose_neon(in, out); - // transpose 8x8 - // Can't use transpose_s16_8x8() because the values are arranged in two 4x8 - // columns. 
- { - // 00 01 02 03 40 41 42 43 - // 10 11 12 13 50 51 52 53 - // 20 21 22 23 60 61 62 63 - // 30 31 32 33 70 71 72 73 - // 04 05 06 07 44 45 46 47 - // 14 15 16 17 54 55 56 57 - // 24 25 26 27 64 65 66 67 - // 34 35 36 37 74 75 76 77 - const int32x4x2_t r02_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out[0]), vreinterpretq_s32_s16(out[2])); - const int32x4x2_t r13_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out[1]), vreinterpretq_s32_s16(out[3])); - const int32x4x2_t r46_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out[4]), vreinterpretq_s32_s16(out[6])); - const int32x4x2_t r57_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out[5]), vreinterpretq_s32_s16(out[7])); - const int16x8x2_t r01_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]), - vreinterpretq_s16_s32(r13_s32.val[0])); - const int16x8x2_t r23_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]), - vreinterpretq_s16_s32(r13_s32.val[1])); - const int16x8x2_t r45_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]), - vreinterpretq_s16_s32(r57_s32.val[0])); - const int16x8x2_t r67_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]), - vreinterpretq_s16_s32(r57_s32.val[1])); - in[0] = r01_s16.val[0]; - in[1] = r01_s16.val[1]; - in[2] = r23_s16.val[0]; - in[3] = r23_s16.val[1]; - in[4] = r45_s16.val[0]; - in[5] = r45_s16.val[1]; - in[6] = r67_s16.val[0]; - in[7] = r67_s16.val[1]; - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - } +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulh_s16 operation on half vector +// can be slightly less accurate, adequate for pass1 +static INLINE void butterfly_one_coeff_s16_fast_half(const int16x4_t a, + const int16x4_t b, + const tran_coef_t constant, + int16x4_t *add, + int16x4_t *sub) { + int16x4_t c = vdup_n_s16(2 * constant); + *add = vqrdmulh_s16(vadd_s16(a, b), c); + *sub = vqrdmulh_s16(vsub_s16(a, b), c); } + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulh_s16 operation on full vector +// can be slightly less accurate, adequate for pass1 +static INLINE void butterfly_one_coeff_s16_fast(const int16x8_t a, + const int16x8_t b, + const tran_coef_t constant, + int16x8_t *add, + int16x8_t *sub) { + int16x8_t c = vdupq_n_s16(2 * constant); + *add = vqrdmulhq_s16(vaddq_s16(a, b), c); + *sub = vqrdmulhq_s16(vsubq_s16(a, b), c); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on full vector +// more accurate does 32-bit processing, takes 16-bit input values, +// returns full 32-bit values, high/low +static INLINE void butterfly_one_coeff_s16_s32_fast( + const int16x8_t a, const int16x8_t b, const tran_coef_t constant, + int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, + int32x4_t *sub_hi) { + int32x4_t c = vdupq_n_s32(constant << 17); + const int16x4_t a_lo = vget_low_s16(a); + const int16x4_t a_hi = vget_high_s16(a); + const int16x4_t b_lo = vget_low_s16(b); + const int16x4_t b_hi = vget_high_s16(b); + *add_lo = vqrdmulhq_s32(vaddl_s16(a_lo, b_lo), c); + *add_hi = vqrdmulhq_s32(vaddl_s16(a_hi, b_hi), c); + *sub_lo = vqrdmulhq_s32(vsubl_s16(a_lo, b_lo), c); + *sub_hi = vqrdmulhq_s32(vsubl_s16(a_hi, b_hi), c); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on full vector +// more accurate does 32-bit processing, takes 16-bit input values, +// returns full 32-bit values, 
then narrows to 16 bits
+static INLINE void butterfly_one_coeff_s16_s32_fast_narrow(
+    const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+    int16x8_t *add, int16x8_t *sub) {
+  int32x4_t add_lo, add_hi, sub_lo, sub_hi;
+  butterfly_one_coeff_s16_s32_fast(a, b, constant, &add_lo, &add_hi, &sub_lo,
+                                   &sub_hi);
+  *add = vcombine_s16(vmovn_s32(add_lo), vmovn_s32(add_hi));
+  *sub = vcombine_s16(vmovn_s32(sub_lo), vmovn_s32(sub_hi));
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on half vector
+// more accurate, does 32-bit processing, takes 16-bit input values,
+// returns full 32-bit values
+static INLINE void butterfly_one_coeff_s16_s32_fast_half(
+    const int16x4_t a, const int16x4_t b, const tran_coef_t constant,
+    int32x4_t *add, int32x4_t *sub) {
+  int32x4_t c = vdupq_n_s32(constant << 17);
+  *add = vqrdmulhq_s32(vaddl_s16(a, b), c);
+  *sub = vqrdmulhq_s32(vsubl_s16(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on half vector
+// more accurate, does 32-bit processing, takes 16-bit input values,
+// returns narrowed down 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32_fast_narrow_half(
+    const int16x4_t a, const int16x4_t b, const tran_coef_t constant,
+    int16x4_t *add, int16x4_t *sub) {
+  int32x4_t add32, sub32;
+  butterfly_one_coeff_s16_s32_fast_half(a, b, constant, &add32, &sub32);
+  *add = vmovn_s32(add32);
+  *sub = vmovn_s32(sub32);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Original variant that performs normal implementation on full vector
+// fully accurate, does 32-bit processing, takes 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32(
+    const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+    int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
+    int32x4_t *sub_hi) {
+  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
+  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
+  const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
+  const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
+  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
+  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
+  *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
+  *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
+  *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
+  *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Original variant that performs normal implementation on full vector
+// fully accurate, does 32-bit processing, takes 16-bit values
+// returns narrowed down 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32_narrow(
+    const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+    int16x8_t *add, int16x8_t *sub) {
+  int32x4_t add32_lo, add32_hi, sub32_lo, sub32_hi;
+  butterfly_one_coeff_s16_s32(a, b, constant, &add32_lo, &add32_hi, &sub32_lo,
+                              &sub32_hi);
+  *add = vcombine_s16(vmovn_s32(add32_lo), vmovn_s32(add32_hi));
+  *sub = vcombine_s16(vmovn_s32(sub32_lo), vmovn_s32(sub32_hi));
+}
+
+// (a +/- b) * c, without rounding
+// Variant that performs a normal multiply-accumulate on full vector
+// does 32-bit processing, takes and returns unrounded 32-bit values,
+// high/low
+static INLINE void butterfly_one_coeff_s32_noround(
+    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+    const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo,
+    int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  const int32x4_t a1 = vmulq_n_s32(a_lo, constant);
+  const int32x4_t a2 = vmulq_n_s32(a_hi, constant);
+  const int32x4_t a3 = vmulq_n_s32(a_lo, constant);
+  const int32x4_t a4 = vmulq_n_s32(a_hi, constant);
+  *add_lo = vmlaq_n_s32(a1, b_lo, constant);
+  *add_hi = vmlaq_n_s32(a2, b_hi, constant);
+  *sub_lo = vmlsq_n_s32(a3, b_lo, constant);
+  *sub_hi = vmlsq_n_s32(a4, b_hi, constant);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on half vector
+// more accurate, does 32-bit processing, takes and returns 32-bit values
+static INLINE void butterfly_one_coeff_s32_fast_half(const int32x4_t a,
+                                                     const int32x4_t b,
+                                                     const tran_coef_t constant,
+                                                     int32x4_t *add,
+                                                     int32x4_t *sub) {
+  const int32x4_t c = vdupq_n_s32(constant << 17);
+  *add = vqrdmulhq_s32(vaddq_s32(a, b), c);
+  *sub = vqrdmulhq_s32(vsubq_s32(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate, does 32-bit processing, takes and returns 32-bit values,
+// high/low
+static INLINE void butterfly_one_coeff_s32_fast(
+    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+    const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo,
+    int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  const int32x4_t c = vdupq_n_s32(constant << 17);
+  *add_lo = vqrdmulhq_s32(vaddq_s32(a_lo, b_lo), c);
+  *add_hi = vqrdmulhq_s32(vaddq_s32(a_hi, b_hi), c);
+  *sub_lo = vqrdmulhq_s32(vsubq_s32(a_lo, b_lo), c);
+  *sub_hi = vqrdmulhq_s32(vsubq_s32(a_hi, b_hi), c);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs normal implementation on half vector
+// more accurate, does 64-bit processing, takes and returns 32-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_s32_s64_narrow_half(
+    const int32x4_t a, const int32x4_t b, const tran_coef_t constant1,
+    const tran_coef_t constant2, int32x4_t *add, int32x4_t *sub) {
+  const int32x2_t a_lo = vget_low_s32(a);
+  const int32x2_t a_hi = vget_high_s32(a);
+  const int32x2_t b_lo = vget_low_s32(b);
+  const int32x2_t b_hi = vget_high_s32(b);
+
+  const int64x2_t axc0_64_lo = vmull_n_s32(a_lo, constant1);
+  const int64x2_t axc0_64_hi = vmull_n_s32(a_hi, constant1);
+  const int64x2_t axc1_64_lo = vmull_n_s32(a_lo, constant2);
+  const int64x2_t axc1_64_hi = vmull_n_s32(a_hi, constant2);
+
+  const int64x2_t sum_lo = vmlal_n_s32(axc0_64_lo, b_lo, constant2);
+  const int64x2_t sum_hi = vmlal_n_s32(axc0_64_hi, b_hi, constant2);
+  const int64x2_t diff_lo = vmlsl_n_s32(axc1_64_lo, b_lo, constant1);
+  const int64x2_t diff_hi = vmlsl_n_s32(axc1_64_hi, b_hi, constant1);
+
+  *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS),
+                      vrshrn_n_s64(sum_hi, DCT_CONST_BITS));
+  *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS),
+                      vrshrn_n_s64(diff_hi, DCT_CONST_BITS));
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs normal implementation on full vector
+// more accurate, does 64-bit processing, takes and returns 32-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_s32_s64_narrow(
+    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+    const int32x4_t b_hi, const tran_coef_t constant1,
+    const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+    int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  // ac1/ac2 hold the following values:
+  // ac1: vget_low_s32(a_lo) * c1, vget_high_s32(a_lo) * c1,
+  //      vget_low_s32(a_hi) * c1, vget_high_s32(a_hi) * c1
+  // ac2: vget_low_s32(a_lo) * c2, vget_high_s32(a_lo) * c2,
+  //      vget_low_s32(a_hi) * c2, vget_high_s32(a_hi) * c2
+  int64x2_t ac1[4];
+  int64x2_t ac2[4];
+  int64x2_t sum[4];
+  int64x2_t diff[4];
+
+  ac1[0] = vmull_n_s32(vget_low_s32(a_lo), constant1);
+  ac1[1] = vmull_n_s32(vget_high_s32(a_lo), constant1);
+  ac1[2] = vmull_n_s32(vget_low_s32(a_hi), constant1);
+  ac1[3] = vmull_n_s32(vget_high_s32(a_hi), constant1);
+  ac2[0] = vmull_n_s32(vget_low_s32(a_lo), constant2);
+  ac2[1] = vmull_n_s32(vget_high_s32(a_lo), constant2);
+  ac2[2] = vmull_n_s32(vget_low_s32(a_hi), constant2);
+  ac2[3] = vmull_n_s32(vget_high_s32(a_hi), constant2);
+
+  sum[0] = vmlal_n_s32(ac1[0], vget_low_s32(b_lo), constant2);
+  sum[1] = vmlal_n_s32(ac1[1], vget_high_s32(b_lo), constant2);
+  sum[2] = vmlal_n_s32(ac1[2], vget_low_s32(b_hi), constant2);
+  sum[3] = vmlal_n_s32(ac1[3], vget_high_s32(b_hi), constant2);
+  *add_lo = vcombine_s32(vrshrn_n_s64(sum[0], DCT_CONST_BITS),
+                         vrshrn_n_s64(sum[1], DCT_CONST_BITS));
+  *add_hi = vcombine_s32(vrshrn_n_s64(sum[2], DCT_CONST_BITS),
+                         vrshrn_n_s64(sum[3], DCT_CONST_BITS));
+
+  diff[0] = vmlsl_n_s32(ac2[0], vget_low_s32(b_lo), constant1);
+  diff[1] = vmlsl_n_s32(ac2[1], vget_high_s32(b_lo), constant1);
+  diff[2] = vmlsl_n_s32(ac2[2], vget_low_s32(b_hi), constant1);
+  diff[3] = vmlsl_n_s32(ac2[3], vget_high_s32(b_hi), constant1);
+  *sub_lo = vcombine_s32(vrshrn_n_s64(diff[0], DCT_CONST_BITS),
+                         vrshrn_n_s64(diff[1], DCT_CONST_BITS));
+  *sub_hi = vcombine_s32(vrshrn_n_s64(diff[2], DCT_CONST_BITS),
+                         vrshrn_n_s64(diff[3], DCT_CONST_BITS));
+}
+
+// a * c1 +/- b * c2, without rounding
+// Original variant that performs normal implementation on full vector
+// does 32-bit processing, takes 16-bit values,
+// returns unrounded full 32-bit values
+static INLINE void butterfly_two_coeff_s16_s32_noround(
+    const int16x4_t a_lo, const int16x4_t a_hi, const int16x4_t b_lo,
+    const int16x4_t b_hi, const tran_coef_t constant1,
+    const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+    int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  const int32x4_t a1 = vmull_n_s16(a_lo, constant1);
+  const int32x4_t a2 = vmull_n_s16(a_hi, constant1);
+  const int32x4_t a3 = vmull_n_s16(a_lo, constant2);
+  const int32x4_t a4 = vmull_n_s16(a_hi, constant2);
+  *add_lo = vmlal_n_s16(a1, b_lo, constant2);
+  *add_hi = vmlal_n_s16(a2, b_hi, constant2);
+  *sub_lo = vmlsl_n_s16(a3, b_lo, constant1);
+  *sub_hi = vmlsl_n_s16(a4, b_hi, constant1);
+}
+
+// a * c1 +/- b * c2, without rounding
+// Original variant that performs normal implementation on full vector
+// does 32-bit processing, takes and returns unrounded 32-bit values
+static INLINE void butterfly_two_coeff_s32_noround(
+    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+    const int32x4_t b_hi, const tran_coef_t constant1,
+    const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+    int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  const int32x4_t a1 = vmulq_n_s32(a_lo, constant1);
+  const int32x4_t a2 = vmulq_n_s32(a_hi, constant1);
+  const int32x4_t a3 = vmulq_n_s32(a_lo, constant2);
+  const int32x4_t a4 = vmulq_n_s32(a_hi, constant2);
+  *add_lo = vmlaq_n_s32(a1, b_lo, constant2);
+  *add_hi = vmlaq_n_s32(a2, b_hi, constant2);
+  *sub_lo = vmlsq_n_s32(a3, b_lo, constant1);
+  *sub_hi = vmlsq_n_s32(a4, b_hi, constant1);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs normal implementation on half vector
+// more accurate, does 32-bit processing, takes and returns 16-bit values
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs the normal implementation on a half vector:
+// more accurate, does 32-bit processing, takes and returns 16-bit values,
+// returns narrowed results.
+static INLINE void butterfly_two_coeff_half(const int16x4_t a,
+                                            const int16x4_t b,
+                                            const tran_coef_t constant1,
+                                            const tran_coef_t constant2,
+                                            int16x4_t *add, int16x4_t *sub) {
+  const int32x4_t a1 = vmull_n_s16(a, constant1);
+  const int32x4_t a2 = vmull_n_s16(a, constant2);
+  const int32x4_t sum = vmlal_n_s16(a1, b, constant2);
+  const int32x4_t diff = vmlsl_n_s16(a2, b, constant1);
+  *add = vqrshrn_n_s32(sum, DCT_CONST_BITS);
+  *sub = vqrshrn_n_s32(diff, DCT_CONST_BITS);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original variant that performs the normal implementation on a full vector:
+// more accurate, does 32-bit processing, takes and returns 16-bit values,
+// returns narrowed results.
+static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
+                                       const tran_coef_t constant1,
+                                       const tran_coef_t constant2,
+                                       int16x8_t *add, int16x8_t *sub) {
+  const int32x4_t a1 = vmull_n_s16(vget_low_s16(a), constant1);
+  const int32x4_t a2 = vmull_n_s16(vget_high_s16(a), constant1);
+  const int32x4_t a3 = vmull_n_s16(vget_low_s16(a), constant2);
+  const int32x4_t a4 = vmull_n_s16(vget_high_s16(a), constant2);
+  const int32x4_t sum0 = vmlal_n_s16(a1, vget_low_s16(b), constant2);
+  const int32x4_t sum1 = vmlal_n_s16(a2, vget_high_s16(b), constant2);
+  const int32x4_t diff0 = vmlsl_n_s16(a3, vget_low_s16(b), constant1);
+  const int32x4_t diff1 = vmlsl_n_s16(a4, vget_high_s16(b), constant1);
+  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
+  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
+  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
+  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
+  *add = vcombine_s16(rounded0, rounded1);
+  *sub = vcombine_s16(rounded2, rounded3);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original variant that performs the normal implementation on a full vector:
+// more accurate, does 32-bit processing, takes and returns 32-bit values;
+// rounds but does not narrow.
+static INLINE void butterfly_two_coeff_s32(
+    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+    const int32x4_t b_hi, const tran_coef_t constant1,
+    const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+    int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  const int32x4_t a1 = vmulq_n_s32(a_lo, constant1);
+  const int32x4_t a2 = vmulq_n_s32(a_hi, constant1);
+  const int32x4_t a3 = vmulq_n_s32(a_lo, constant2);
+  const int32x4_t a4 = vmulq_n_s32(a_hi, constant2);
+  const int32x4_t sum0 = vmlaq_n_s32(a1, b_lo, constant2);
+  const int32x4_t sum1 = vmlaq_n_s32(a2, b_hi, constant2);
+  const int32x4_t diff0 = vmlsq_n_s32(a3, b_lo, constant1);
+  const int32x4_t diff1 = vmlsq_n_s32(a4, b_hi, constant1);
+  *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
+  *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
+  *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
+  *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+}
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding.
+static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) {
+  const int16x8_t one = vdupq_n_s16(1);
+  const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
+  const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
+  const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
+  return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2);
+}
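The *_round_shift helpers in this header implement the forward transform's asymmetric rounding without a branch: the sign bit, shifted down logically, is added (or subtracted) before the shift. A quick exhaustive check over the 16-bit range that the branchless forms match the scalar expressions the descriptions imply ("add 1 if positive, 2 if negative" is (a + 1 + (a < 0)) >> 2; "add 2 if positive, 1 if negative" is (a + 1 + (a > 0)) >> 2):

#include <assert.h>
#include <stdint.h>

int main(void) {
  int32_t a;
  for (a = -32768; a <= 32767; ++a) {
    // Logical shift of the sign bit, as in vshrq_n_u16(a_u16, 15).
    const int32_t sign = (a < 0);
    // add_round_shift_*: add 1, add the sign bit, shift without rounding.
    assert(((a + sign + 1) >> 2) == ((a + 1 + (a < 0)) >> 2));
    // sub_round_shift_*: subtract the sign bit, then rounding shift;
    // vrshrq_n_s16(x, 2) computes (x + 2) >> 2.
    assert(((a - sign + 2) >> 2) == ((a + 1 + (a > 0)) >> 2));
  }
  return 0;
}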
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding;
+// returns narrowed results.
+static INLINE int16x8_t add_round_shift_s32_narrow(const int32x4_t a_lo,
+                                                   const int32x4_t a_hi) {
+  const int32x4_t one = vdupq_n_s32(1);
+  const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo);
+  const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31);
+  const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32);
+  const int16x4_t b_lo =
+      vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2);
+  const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi);
+  const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31);
+  const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32);
+  const int16x4_t b_hi =
+      vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2);
+  return vcombine_s16(b_lo, b_hi);
+}
+
+// Add 1 if negative, and shift by 1.
+// In practice, add the sign bit, then shift without rounding.
+static INLINE int32x4_t add_round_shift_half_s32(const int32x4_t a) {
+  const uint32x4_t a_u32 = vreinterpretq_u32_s32(a);
+  const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31);
+  const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32);
+  return vshrq_n_s32(vaddq_s32(a, a_sign_s32), 1);
+}
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding.
+static INLINE int32x4_t add_round_shift_s32(const int32x4_t a) {
+  const int32x4_t one = vdupq_n_s32(1);
+  const uint32x4_t a_u32 = vreinterpretq_u32_s32(a);
+  const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31);
+  const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32);
+  return vshrq_n_s32(vaddq_s32(vaddq_s32(a, a_sign_s32), one), 2);
+}
+
+// Add 2 if positive, 1 if negative, and shift by 2.
+// In practice, subtract the sign bit, then shift with rounding.
+static INLINE int16x8_t sub_round_shift_s16(const int16x8_t a) {
+  const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
+  const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
+  const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
+  return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2);
+}
+
+// Add 2 if positive, 1 if negative, and shift by 2.
+// In practice, subtract the sign bit, then shift with rounding.
+static INLINE int32x4_t sub_round_shift_s32(const int32x4_t a) { + const uint32x4_t a_u32 = vreinterpretq_u32_s32(a); + const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31); + const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32); + return vrshrq_n_s32(vsubq_s32(a, a_sign_s32), 2); +} + #endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_ diff --git a/libvpx/vpx_dsp/arm/fdct_partial_neon.c b/libvpx/vpx_dsp/arm/fdct_partial_neon.c index 0a1cdca41..718dba0d9 100644 --- a/libvpx/vpx_dsp/arm/fdct_partial_neon.c +++ b/libvpx/vpx_dsp/arm/fdct_partial_neon.c @@ -101,3 +101,68 @@ void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, output[0] = (tran_low_t)(sum >> 3); output[1] = 0; } + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, + int stride) { + int32x4_t partial_sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), + vdupq_n_s32(0) }; + int32_t sum; + + int r = 0; + do { + const int16x8_t a = vld1q_s16(input); + const int16x8_t b = vld1q_s16(input + 8); + input += stride; + partial_sum[0] = vaddw_s16(partial_sum[0], vget_low_s16(a)); + partial_sum[1] = vaddw_s16(partial_sum[1], vget_high_s16(a)); + partial_sum[2] = vaddw_s16(partial_sum[2], vget_low_s16(b)); + partial_sum[3] = vaddw_s16(partial_sum[3], vget_high_s16(b)); + r++; + } while (r < 16); + + partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[1]); + partial_sum[2] = vaddq_s32(partial_sum[2], partial_sum[3]); + partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[2]); + sum = horizontal_add_int32x4(partial_sum[0]); + + output[0] = (tran_low_t)(sum >> 1); + output[1] = 0; +} + +void vpx_highbd_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, + int stride) { + int32x4_t partial_sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), + vdupq_n_s32(0) }; + + int32_t sum; + + int r = 0; + do { + const int16x8_t a0 = vld1q_s16(input); + const int16x8_t a1 = vld1q_s16(input + 8); + const int16x8_t a2 = vld1q_s16(input + 16); + const int16x8_t a3 = vld1q_s16(input + 24); + input += stride; + partial_sum[0] = vaddw_s16(partial_sum[0], vget_low_s16(a0)); + partial_sum[0] = vaddw_s16(partial_sum[0], vget_high_s16(a0)); + partial_sum[1] = vaddw_s16(partial_sum[1], vget_low_s16(a1)); + partial_sum[1] = vaddw_s16(partial_sum[1], vget_high_s16(a1)); + partial_sum[2] = vaddw_s16(partial_sum[2], vget_low_s16(a2)); + partial_sum[2] = vaddw_s16(partial_sum[2], vget_high_s16(a2)); + partial_sum[3] = vaddw_s16(partial_sum[3], vget_low_s16(a3)); + partial_sum[3] = vaddw_s16(partial_sum[3], vget_high_s16(a3)); + r++; + } while (r < 32); + + partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[1]); + partial_sum[2] = vaddq_s32(partial_sum[2], partial_sum[3]); + partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[2]); + sum = horizontal_add_int32x4(partial_sum[0]); + + output[0] = (tran_low_t)(sum >> 3); + output[1] = 0; +} + +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libvpx/vpx_dsp/arm/fwd_txfm_neon.c b/libvpx/vpx_dsp/arm/fwd_txfm_neon.c deleted file mode 100644 index d9161c6d3..000000000 --- a/libvpx/vpx_dsp/arm/fwd_txfm_neon.c +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. 
All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <arm_neon.h> - -#include "./vpx_config.h" -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/txfm_common.h" -#include "vpx_dsp/vpx_dsp_common.h" -#include "vpx_dsp/arm/idct_neon.h" -#include "vpx_dsp/arm/fdct_neon.h" -#include "vpx_dsp/arm/mem_neon.h" - -void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, - int stride) { - int i; - // stage 1 - int16x8_t in[8]; - in[0] = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); - in[1] = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2); - in[2] = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2); - in[3] = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2); - in[4] = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2); - in[5] = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2); - in[6] = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2); - in[7] = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2); - for (i = 0; i < 2; ++i) { - vpx_fdct8x8_pass1_neon(in); - } // for - { - // from vpx_dct_sse2.c - // Post-condition (division by two) - // division of two 16 bits signed numbers using shifts - // n / 2 = (n - (n >> 15)) >> 1 - const int16x8_t sign_in0 = vshrq_n_s16(in[0], 15); - const int16x8_t sign_in1 = vshrq_n_s16(in[1], 15); - const int16x8_t sign_in2 = vshrq_n_s16(in[2], 15); - const int16x8_t sign_in3 = vshrq_n_s16(in[3], 15); - const int16x8_t sign_in4 = vshrq_n_s16(in[4], 15); - const int16x8_t sign_in5 = vshrq_n_s16(in[5], 15); - const int16x8_t sign_in6 = vshrq_n_s16(in[6], 15); - const int16x8_t sign_in7 = vshrq_n_s16(in[7], 15); - in[0] = vhsubq_s16(in[0], sign_in0); - in[1] = vhsubq_s16(in[1], sign_in1); - in[2] = vhsubq_s16(in[2], sign_in2); - in[3] = vhsubq_s16(in[3], sign_in3); - in[4] = vhsubq_s16(in[4], sign_in4); - in[5] = vhsubq_s16(in[5], sign_in5); - in[6] = vhsubq_s16(in[6], sign_in6); - in[7] = vhsubq_s16(in[7], sign_in7); - // store results - store_s16q_to_tran_low(final_output + 0 * 8, in[0]); - store_s16q_to_tran_low(final_output + 1 * 8, in[1]); - store_s16q_to_tran_low(final_output + 2 * 8, in[2]); - store_s16q_to_tran_low(final_output + 3 * 8, in[3]); - store_s16q_to_tran_low(final_output + 4 * 8, in[4]); - store_s16q_to_tran_low(final_output + 5 * 8, in[5]); - store_s16q_to_tran_low(final_output + 6 * 8, in[6]); - store_s16q_to_tran_low(final_output + 7 * 8, in[7]); - } -} diff --git a/libvpx/vpx_dsp/arm/hadamard_neon.c b/libvpx/vpx_dsp/arm/hadamard_neon.c index 523a63c6f..f6b6d7e3c 100644 --- a/libvpx/vpx_dsp/arm/hadamard_neon.c +++ b/libvpx/vpx_dsp/arm/hadamard_neon.c @@ -114,3 +114,45 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, coeff += 8; } } + +void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int i; + + /* Rearrange 32x32 to 16x64 and remove stride. + * Top left first. */ + vpx_hadamard_16x16_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0); + /* Top right. */ + vpx_hadamard_16x16_neon(src_diff + 16 + 0 * src_stride, src_stride, + coeff + 256); + /* Bottom left. */ + vpx_hadamard_16x16_neon(src_diff + 0 + 16 * src_stride, src_stride, + coeff + 512); + /* Bottom right. 
*/
+  vpx_hadamard_16x16_neon(src_diff + 16 + 16 * src_stride, src_stride,
+                          coeff + 768);
+
+  for (i = 0; i < 256; i += 8) {
+    const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0);
+    const int16x8_t a1 = load_tran_low_to_s16q(coeff + 256);
+    const int16x8_t a2 = load_tran_low_to_s16q(coeff + 512);
+    const int16x8_t a3 = load_tran_low_to_s16q(coeff + 768);
+
+    const int16x8_t b0 = vhaddq_s16(a0, a1);
+    const int16x8_t b1 = vhsubq_s16(a0, a1);
+    const int16x8_t b2 = vhaddq_s16(a2, a3);
+    const int16x8_t b3 = vhsubq_s16(a2, a3);
+
+    const int16x8_t c0 = vhaddq_s16(b0, b2);
+    const int16x8_t c1 = vhaddq_s16(b1, b3);
+    const int16x8_t c2 = vhsubq_s16(b0, b2);
+    const int16x8_t c3 = vhsubq_s16(b1, b3);
+
+    store_s16q_to_tran_low(coeff + 0, c0);
+    store_s16q_to_tran_low(coeff + 256, c1);
+    store_s16q_to_tran_low(coeff + 512, c2);
+    store_s16q_to_tran_low(coeff + 768, c3);
+
+    coeff += 8;
+  }
+}
diff --git a/libvpx/vpx_dsp/arm/highbd_quantize_neon.c b/libvpx/vpx_dsp/arm/highbd_quantize_neon.c
new file mode 100644
index 000000000..b9f72a94c
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/highbd_quantize_neon.c
@@ -0,0 +1,307 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+
+static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store(
+    const int32x4_t dqcoeff_0, const int32x4_t dqcoeff_1,
+    tran_low_t *dqcoeff_ptr) {
+  vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+  vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
+}
+
+static VPX_FORCE_INLINE void highbd_quantize_8_neon(
+    const int32x4_t coeff_0, const int32x4_t coeff_1, const int32x4_t zbin,
+    const int32x4_t round, const int32x4_t quant, const int32x4_t quant_shift,
+    int32x4_t *qcoeff_0, int32x4_t *qcoeff_1) {
+  // Take the sign and absolute value of the two 4 x 32-bit coeff vectors.
+  const int32x4_t coeff_0_sign = vshrq_n_s32(coeff_0, 31);
+  const int32x4_t coeff_1_sign = vshrq_n_s32(coeff_1, 31);
+  const int32x4_t coeff_0_abs = vabsq_s32(coeff_0);
+  const int32x4_t coeff_1_abs = vabsq_s32(coeff_1);
+
+  // Calculate 2 masks of elements outside the bin
+  const int32x4_t zbin_mask_0 =
+      vreinterpretq_s32_u32(vcgeq_s32(coeff_0_abs, zbin));
+  const int32x4_t zbin_mask_1 = vreinterpretq_s32_u32(
+      vcgeq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(zbin), 1)));
+
+  // Get the rounded values
+  const int32x4_t rounded_0 = vaddq_s32(coeff_0_abs, round);
+  const int32x4_t rounded_1 =
+      vaddq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(round), 1));
+
+  // vqdmulhq_s32(a, b) is (a * b * 2) >> 32; quant was pre-shifted left by
+  // 15, so this computes (rounded * quant) >> 16.
+  int32x4_t qcoeff_tmp_0 = vqdmulhq_s32(rounded_0, quant);
+  int32x4_t qcoeff_tmp_1 =
+      vqdmulhq_s32(rounded_1, vdupq_lane_s32(vget_low_s32(quant), 1));
+
+  // Add rounded values
+  qcoeff_tmp_0 = vaddq_s32(qcoeff_tmp_0, rounded_0);
+  qcoeff_tmp_1 = vaddq_s32(qcoeff_tmp_1, rounded_1);
+
+  // Likewise, with quant_shift pre-shifted left by 15 this computes
+  // (tmp * quant_shift) >> 16.
+  qcoeff_tmp_0 = vqdmulhq_s32(qcoeff_tmp_0, quant_shift);
+  qcoeff_tmp_1 =
+      vqdmulhq_s32(qcoeff_tmp_1, vdupq_lane_s32(vget_low_s32(quant_shift), 1));
+
+  // Restore the sign bit.
+  qcoeff_tmp_0 = veorq_s32(qcoeff_tmp_0, coeff_0_sign);
+  qcoeff_tmp_1 = veorq_s32(qcoeff_tmp_1, coeff_1_sign);
+  qcoeff_tmp_0 = vsubq_s32(qcoeff_tmp_0, coeff_0_sign);
+  qcoeff_tmp_1 = vsubq_s32(qcoeff_tmp_1, coeff_1_sign);
+
+  // Only keep the relevant coeffs
+  *qcoeff_0 = vandq_s32(qcoeff_tmp_0, zbin_mask_0);
+  *qcoeff_1 = vandq_s32(qcoeff_tmp_1, zbin_mask_1);
+}
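Per lane, highbd_quantize_8_neon performs the arithmetic in the scalar sketch below. This is for orientation only, not the library API: it assumes the caller has already picked the DC or AC entry of each table, and it leans on the identity vqdmulhq_s32(x, q << 15) == (x * q) >> 16, which holds for the coefficient ranges the saturating NEON ops are fed here.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

// One quantizer lane: returns the signed qcoeff for one coefficient.
// Illustrative sketch; dequant/dqcoeff handling omitted.
static int32_t quantize_lane(int32_t coeff, int32_t zbin, int32_t round,
                             int32_t quant, int32_t quant_shift) {
  const int32_t abs_coeff = abs(coeff);
  int64_t tmp;
  int32_t abs_qcoeff;
  if (abs_coeff < zbin) return 0;     // zbin mask: coefficient too small
  tmp = (int64_t)abs_coeff + round;   // rounded value
  tmp = ((tmp * quant) >> 16) + tmp;  // first vqdmulhq_s32, plus rounded
  abs_qcoeff = (int32_t)((tmp * quant_shift) >> 16);  // second vqdmulhq_s32
  return coeff < 0 ? -abs_qcoeff : abs_qcoeff;        // restore the sign
}

int main(void) {
  // Made-up table values, purely illustrative.
  printf("%d\n", quantize_lane(-1234, 30, 40, 25000, 20000));
  return 0;
}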
+
+static VPX_FORCE_INLINE int16x8_t
+highbd_quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+                       tran_low_t *dqcoeff_ptr, const int32x4_t zbin,
+                       const int32x4_t round, const int32x4_t quant,
+                       const int32x4_t quant_shift, const int32x4_t dequant) {
+  int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1;
+
+  // Load coeffs as 2 vectors of 4 x 32-bit ints each; the helper takes their
+  // sign and abs values.
+  const int32x4_t coeff_0 = vld1q_s32(coeff_ptr);
+  const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4);
+  highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift,
+                         &qcoeff_0, &qcoeff_1);
+
+  // Store the 32-bit qcoeffs
+  vst1q_s32(qcoeff_ptr, qcoeff_0);
+  vst1q_s32(qcoeff_ptr + 4, qcoeff_1);
+
+  // Calculate and store the dqcoeffs
+  dqcoeff_0 = vmulq_s32(qcoeff_0, dequant);
+  dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1));
+
+  highbd_calculate_dqcoeff_and_store(dqcoeff_0, dqcoeff_1, dqcoeff_ptr);
+
+  return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1));
+}
+
+void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                const int16_t *zbin_ptr,
+                                const int16_t *round_ptr,
+                                const int16_t *quant_ptr,
+                                const int16_t *quant_shift_ptr,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                const int16_t *scan, const int16_t *iscan) {
+  const int16x8_t neg_one = vdupq_n_s16(-1);
+  uint16x8_t eob_max;
+
+  // Only the first element of each table is DC; the remaining entries all
+  // hold the AC value, so the AC vector can be rebuilt by duplicating
+  // element 1 and a single 4 x 32-bit vector per parameter is enough.
+  int32x4_t zbin = vmovl_s16(vld1_s16(zbin_ptr));
+  int32x4_t round = vmovl_s16(vld1_s16(round_ptr));
+  // Widen quant and quant_shift to 32-bit elements and pre-shift them left
+  // by 15 so that vqdmulhq_s32 can stand in for the reference's >> 16.
+  int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15);
+  int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 15);
+  int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr));
+
+  // Process first 8 values which include a dc component.
+  {
+    const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+    const int16x8_t qcoeff =
+        highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+                               quant, quant_shift, dequant);
+
+    // Set non-zero elements to -1 and use that to extract values for eob.
+    eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+
+    __builtin_prefetch(coeff_ptr + 64);
+
+    coeff_ptr += 8;
+    iscan += 8;
+    qcoeff_ptr += 8;
+    dqcoeff_ptr += 8;
+  }
+
+  n_coeffs -= 8;
+
+  {
+    zbin = vdupq_lane_s32(vget_low_s32(zbin), 1);
+    round = vdupq_lane_s32(vget_low_s32(round), 1);
+    quant = vdupq_lane_s32(vget_low_s32(quant), 1);
+    quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1);
+    dequant = vdupq_lane_s32(vget_low_s32(dequant), 1);
+
+    do {
+      const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+      const int16x8_t qcoeff =
+          highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin,
+                                 round, quant, quant_shift, dequant);
+
+      // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + + __builtin_prefetch(coeff_ptr + 64); + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + n_coeffs -= 8; + } while (n_coeffs > 0); + } + +#ifdef __aarch64__ + *eob_ptr = vmaxvq_u16(eob_max); +#else + { + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); + } +#endif // __aarch64__ + // Need these here, else the compiler complains about mixing declarations and + // code in C90 + (void)n_coeffs; + (void)scan; +} + +static VPX_FORCE_INLINE int32x4_t extract_sign_bit(int32x4_t a) { + return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31)); +} + +static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store_32x32( + int32x4_t dqcoeff_0, int32x4_t dqcoeff_1, tran_low_t *dqcoeff_ptr) { + // Add 1 if negative to round towards zero because the C uses division. + dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + + dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1); + dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1); + vst1q_s32(dqcoeff_ptr, dqcoeff_0); + vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1); +} + +static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon( + const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int32x4_t zbin, const int32x4_t round, + const int32x4_t quant, const int32x4_t quant_shift, + const int32x4_t dequant) { + int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1; + + // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values + const int32x4_t coeff_0 = vld1q_s32(coeff_ptr); + const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4); + highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift, + &qcoeff_0, &qcoeff_1); + + // Store the 32-bit qcoeffs + vst1q_s32(qcoeff_ptr, qcoeff_0); + vst1q_s32(qcoeff_ptr + 4, qcoeff_1); + + // Calculate and store the dqcoeffs + dqcoeff_0 = vmulq_s32(qcoeff_0, dequant); + dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1)); + + highbd_calculate_dqcoeff_and_store_32x32(dqcoeff_0, dqcoeff_1, dqcoeff_ptr); + + return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1)); +} + +void vpx_highbd_quantize_b_32x32_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int16x8_t neg_one = vdupq_n_s16(-1); + uint16x8_t eob_max; + int i; + + // Only the first element of each vector is DC. + // High half has identical elements, but we can reconstruct it from the low + // half by duplicating the 2nd element. 
So we only need to pass a 4x32-bit + // vector + int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(zbin_ptr)), 1); + int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(round_ptr)), 1); + // Extend the quant, quant_shift vectors to ones of 32-bit elements + // scale to high-half, so we can use vqdmulhq_s32 + int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15); + int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 16); + int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr)); + + // Process first 8 values which include a dc component. + { + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); + + const int16x8_t qcoeff = + highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, + round, quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + + __builtin_prefetch(coeff_ptr + 64); + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + + { + zbin = vdupq_lane_s32(vget_low_s32(zbin), 1); + round = vdupq_lane_s32(vget_low_s32(round), 1); + quant = vdupq_lane_s32(vget_low_s32(quant), 1); + quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1); + dequant = vdupq_lane_s32(vget_low_s32(dequant), 1); + + for (i = 1; i < 32 * 32 / 8; ++i) { + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); + + const int16x8_t qcoeff = + highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, + round, quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + + __builtin_prefetch(coeff_ptr + 64); + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + } + +#ifdef __aarch64__ + *eob_ptr = vmaxvq_u16(eob_max); +#else + { + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); + } +#endif // __aarch64__ + // Need these here, else the compiler complains about mixing declarations and + // code in C90 + (void)n_coeffs; + (void)scan; +} diff --git a/libvpx/vpx_dsp/arm/highbd_sad_neon.c b/libvpx/vpx_dsp/arm/highbd_sad_neon.c new file mode 100644 index 000000000..ecb52ce5a --- /dev/null +++ b/libvpx/vpx_dsp/arm/highbd_sad_neon.c @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static VPX_FORCE_INLINE uint32_t highbd_sad4_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int width, + int height) { + int i, j; + uint32x4_t sum_abs_diff = vdupq_n_u32(0); + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 4) { + const uint16x4_t src_u16 = vld1_u16(src16_ptr + j); + const uint16x4_t ref_u16 = vld1_u16(ref16_ptr + j); + sum_abs_diff = vabal_u16(sum_abs_diff, src_u16, ref_u16); + } + src16_ptr += src_stride; + ref16_ptr += ref_stride; + } + + return horizontal_add_uint32x4(sum_abs_diff); +} + +static VPX_FORCE_INLINE uint32_t highbd_sad8_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int width, + int height) { + int i, j; + uint32x4_t sum_abs_diff = vdupq_n_u32(0); + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 8) { + const uint16x8_t src_u16 = vld1q_u16(src16_ptr + j); + const uint16x8_t ref_u16 = vld1q_u16(ref16_ptr + j); + sum_abs_diff = + vabal_u16(sum_abs_diff, vget_low_u16(src_u16), vget_low_u16(ref_u16)); + sum_abs_diff = vabal_u16(sum_abs_diff, vget_high_u16(src_u16), + vget_high_u16(ref_u16)); + } + src16_ptr += src_stride; + ref16_ptr += ref_stride; + } + + return horizontal_add_uint32x4(sum_abs_diff); +} + +static VPX_FORCE_INLINE uint32_t highbd_sad4_avg_neon( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, const uint8_t *second_pred, int width, int height) { + int i, j; + uint32x4_t sum_abs_diff = vdupq_n_u32(0); + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred); + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 4) { + const uint16x4_t a_u16 = vld1_u16(src16_ptr + j); + const uint16x4_t b_u16 = vld1_u16(ref16_ptr + j); + const uint16x4_t c_u16 = vld1_u16(pred_ptr + j); + const uint16x4_t avg = vrhadd_u16(b_u16, c_u16); + sum_abs_diff = vabal_u16(sum_abs_diff, a_u16, avg); + } + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred_ptr += width; + } + + return horizontal_add_uint32x4(sum_abs_diff); +} + +static VPX_FORCE_INLINE uint32_t highbd_sad8_avg_neon( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, const uint8_t *second_pred, int width, int height) { + int i, j; + uint32x4_t sum_abs_diff = vdupq_n_u32(0); + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred); + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 8) { + const uint16x8_t a_u16 = vld1q_u16(src16_ptr + j); + const uint16x8_t b_u16 = vld1q_u16(ref16_ptr + j); + const uint16x8_t c_u16 = vld1q_u16(pred_ptr + j); + const uint16x8_t avg = vrhaddq_u16(b_u16, c_u16); + sum_abs_diff = + vabal_u16(sum_abs_diff, vget_low_u16(a_u16), vget_low_u16(avg)); + sum_abs_diff = + vabal_u16(sum_abs_diff, vget_high_u16(a_u16), vget_high_u16(avg)); + } + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred_ptr += 
width; + } + + return horizontal_add_uint32x4(sum_abs_diff); +} + +#define highbd_sad4MxN(m, n) \ + unsigned int vpx_highbd_sad##m##x##n##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return highbd_sad4_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ + } + +#define highbd_sadMxN(m, n) \ + unsigned int vpx_highbd_sad##m##x##n##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return highbd_sad8_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ + } + +#define highbd_sad4MxN_avg(m, n) \ + unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + return highbd_sad4_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \ + second_pred, m, n); \ + } + +#define highbd_sadMxN_avg(m, n) \ + unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + return highbd_sad8_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \ + second_pred, m, n); \ + } + +#define highbd_sadMxNx4D(m, n) \ + void vpx_highbd_sad##m##x##n##x4d_neon( \ + const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], int ref_stride, \ + uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = vpx_highbd_sad##m##x##n##_neon(src_ptr, src_stride, \ + ref_array[i], ref_stride); \ + } \ + } + +/* clang-format off */ +// 4x4 +highbd_sad4MxN(4, 4) +highbd_sad4MxN_avg(4, 4) +highbd_sadMxNx4D(4, 4) + +// 4x8 +highbd_sad4MxN(4, 8) +highbd_sad4MxN_avg(4, 8) +highbd_sadMxNx4D(4, 8) + +// 8x4 +highbd_sadMxN(8, 4) +highbd_sadMxN_avg(8, 4) +highbd_sadMxNx4D(8, 4) + +// 8x8 +highbd_sadMxN(8, 8) +highbd_sadMxN_avg(8, 8) +highbd_sadMxNx4D(8, 8) + +// 8x16 +highbd_sadMxN(8, 16) +highbd_sadMxN_avg(8, 16) +highbd_sadMxNx4D(8, 16) + +// 16x8 +highbd_sadMxN(16, 8) +highbd_sadMxN_avg(16, 8) +highbd_sadMxNx4D(16, 8) + +// 16x16 +highbd_sadMxN(16, 16) +highbd_sadMxN_avg(16, 16) +highbd_sadMxNx4D(16, 16) + +// 16x32 +highbd_sadMxN(16, 32) +highbd_sadMxN_avg(16, 32) +highbd_sadMxNx4D(16, 32) + +// 32x16 +highbd_sadMxN(32, 16) +highbd_sadMxN_avg(32, 16) +highbd_sadMxNx4D(32, 16) + +// 32x32 +highbd_sadMxN(32, 32) +highbd_sadMxN_avg(32, 32) +highbd_sadMxNx4D(32, 32) + +// 32x64 +highbd_sadMxN(32, 64) +highbd_sadMxN_avg(32, 64) +highbd_sadMxNx4D(32, 64) + +// 64x32 +highbd_sadMxN(64, 32) +highbd_sadMxN_avg(64, 32) +highbd_sadMxNx4D(64, 32) + +// 64x64 +highbd_sadMxN(64, 64) +highbd_sadMxN_avg(64, 64) +highbd_sadMxNx4D(64, 64) + /* clang-format on */ diff --git a/libvpx/vpx_dsp/arm/highbd_variance_neon.c b/libvpx/vpx_dsp/arm/highbd_variance_neon.c new file mode 100644 index 000000000..96a35af01 --- /dev/null +++ b/libvpx/vpx_dsp/arm/highbd_variance_neon.c @@ -0,0 +1,496 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" +#include "vpx_ports/mem.h" + +static const uint8_t bilinear_filters[8][2] = { + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, +}; + +static INLINE void highbd_variance16(const uint16_t *src_ptr, int src_stride, + const uint16_t *ref_ptr, int ref_stride, + int w, int h, uint64_t *sse, + int64_t *sum) { + int i, j; + + if (w >= 8) { + int32x4_t sum_s32 = vdupq_n_s32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const int16x8_t src_s16 = vreinterpretq_s16_u16(vld1q_u16(&src_ptr[j])); + const int16x8_t ref_s16 = vreinterpretq_s16_u16(vld1q_u16(&ref_ptr[j])); + const int32x4_t diff1_s32 = + vsubl_s16(vget_low_s16(src_s16), vget_low_s16(ref_s16)); + const int32x4_t diff2_s32 = + vsubl_s16(vget_high_s16(src_s16), vget_high_s16(ref_s16)); + const uint32x4_t diff1_u32 = vreinterpretq_u32_s32(diff1_s32); + const uint32x4_t diff2_u32 = vreinterpretq_u32_s32(diff2_s32); + sum_s32 = vaddq_s32(sum_s32, diff1_s32); + sum_s32 = vaddq_s32(sum_s32, diff2_s32); + sse_u32 = vmlaq_u32(sse_u32, diff1_u32, diff1_u32); + sse_u32 = vmlaq_u32(sse_u32, diff2_u32, diff2_u32); + } + src_ptr += src_stride; + ref_ptr += ref_stride; + } + *sum = horizontal_add_int32x4(sum_s32); + *sse = horizontal_add_uint32x4(sse_u32); + } else { + int32x4_t sum_s32 = vdupq_n_s32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + assert(w >= 4); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 4) { + const int16x4_t src_s16 = vreinterpret_s16_u16(vld1_u16(&src_ptr[j])); + const int16x4_t ref_s16 = vreinterpret_s16_u16(vld1_u16(&ref_ptr[j])); + const int32x4_t diff_s32 = vsubl_s16(src_s16, ref_s16); + const uint32x4_t diff_u32 = vreinterpretq_u32_s32(diff_s32); + sum_s32 = vaddq_s32(sum_s32, diff_s32); + sse_u32 = vmlaq_u32(sse_u32, diff_u32, diff_u32); + } + src_ptr += src_stride; + ref_ptr += ref_stride; + } + *sum = horizontal_add_int32x4(sum_s32); + *sse = horizontal_add_uint32x4(sse_u32); + } +} + +static INLINE void highbd_variance64(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, + int w, int h, uint64_t *sse, + int64_t *sum) { + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr); + + if (w < 32 && h < 32) { + highbd_variance16(src_ptr, src_stride, ref_ptr, ref_stride, w, h, sse, sum); + } else { + uint64_t sse_long = 0; + int64_t sum_long = 0; + int k, l; + for (k = 0; k + 16 <= h; k += 16) { + for (l = 0; l + 16 <= w; l += 16) { + uint64_t sse_tmp = 0; + int64_t sum_tmp = 0; + highbd_variance16(src_ptr + l, src_stride, ref_ptr + l, ref_stride, 16, + 16, &sse_tmp, &sum_tmp); + sum_long += sum_tmp; + sse_long += sse_tmp; + } + src_ptr += 16 * src_stride; + ref_ptr += 16 * ref_stride; + } + *sum = sum_long; + *sse = sse_long; + } +} + +static INLINE void highbd_8_variance(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, + int w, int h, uint32_t *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, + &sum_long); + *sse = (uint32_t)sse_long; + *sum = (int)sum_long; +} + +static INLINE void highbd_10_variance(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, + int w, int h, uint32_t *sse, int 
*sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, + &sum_long); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); +} + +static INLINE void highbd_12_variance(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, + int w, int h, uint32_t *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, + &sum_long); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); +} + +#define HIGHBD_VAR(W, H) \ + uint32_t vpx_highbd_8_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } \ + \ + uint32_t vpx_highbd_10_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + uint32_t vpx_highbd_12_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define HIGHBD_GET_VAR(S) \ + void vpx_highbd_8_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ + sum); \ + } \ + \ + void vpx_highbd_10_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ + sum); \ + } \ + \ + void vpx_highbd_12_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ + sum); \ + } + +#define HIGHBD_MSE(W, H) \ + uint32_t vpx_highbd_8_mse##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_10_mse##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_12_mse##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse; \ + } + +static INLINE void highbd_var_filter_block2d_bil_first_pass( + const uint8_t *src_ptr8, 
uint16_t *output_ptr, + unsigned int src_pixels_per_line, int pixel_step, + unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { + uint32_t i, j; + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8); + + uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1); + uint16x4_t filter1_u16 = vdup_n_u16(filter[0]); + uint16x4_t filter2_u16 = vdup_n_u16(filter[1]); + + if (output_width >= 8) { + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 8) { + const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]); + const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]); + uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16)); + uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16)); + uint16x4_t out1_u16; + uint16x4_t out2_u16; + sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16)); + sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16)); + out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS); + out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS); + vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16)); + } + // Next row... + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } + } else { + assert(output_width >= 4); + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 4) { + const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]); + const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]); + uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16); + uint16x4_t out_u16; + sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16); + out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS); + vst1_u16(&output_ptr[j], out_u16); + } + // Next row... + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } + } +} + +static INLINE void highbd_var_filter_block2d_bil_second_pass( + const uint16_t *src_ptr, uint16_t *output_ptr, + unsigned int src_pixels_per_line, unsigned int pixel_step, + unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { + uint32_t i, j; + + uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1); + uint16x4_t filter1_u16 = vdup_n_u16(filter[0]); + uint16x4_t filter2_u16 = vdup_n_u16(filter[1]); + + if (output_width >= 8) { + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 8) { + const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]); + const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]); + uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16)); + uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16)); + uint16x4_t out1_u16; + uint16x4_t out2_u16; + sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16)); + sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16)); + out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS); + out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS); + vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16)); + } + // Next row... 
+ src_ptr += src_pixels_per_line; + output_ptr += output_width; + } + } else { + assert(output_width >= 4); + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 4) { + const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]); + const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]); + uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16); + uint16x4_t out_u16; + sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16); + out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS); + vst1_u16(&output_ptr[j], out_u16); + } + // Next row... + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } + } +} + +#define HIGHBD_SUBPIX_VAR(W, H) \ + uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp2), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_highbd_10_variance##W##x##H##_neon( \ + CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_highbd_12_variance##W##x##H##_neon( \ + CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \ + } + +#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ + uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \ + H, temp2, W); \ + \ + return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp3), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + 
const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \ + H, temp2, W); \ + \ + return vpx_highbd_10_variance##W##x##H##_neon( \ + CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \ + H, temp2, W); \ + \ + return vpx_highbd_12_variance##W##x##H##_neon( \ + CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, ref_stride, sse); \ + } + +void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, + int width, int height, const uint16_t *ref, + int ref_stride) { + int i, j; + uint32x4_t one_u32 = vdupq_n_u32(1); + if (width >= 8) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 8) { + const uint16x8_t pred_u16 = vld1q_u16(&pred[j]); + const uint16x8_t ref_u16 = vld1q_u16(&ref[j]); + const uint32x4_t sum1_u32 = + vaddl_u16(vget_low_u16(pred_u16), vget_low_u16(ref_u16)); + const uint32x4_t sum2_u32 = + vaddl_u16(vget_high_u16(pred_u16), vget_high_u16(ref_u16)); + const uint16x4_t sum1_u16 = + vshrn_n_u32(vaddq_u32(sum1_u32, one_u32), 1); + const uint16x4_t sum2_u16 = + vshrn_n_u32(vaddq_u32(sum2_u32, one_u32), 1); + const uint16x8_t vcomp_pred = vcombine_u16(sum1_u16, sum2_u16); + vst1q_u16(&comp_pred[j], vcomp_pred); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } else { + assert(width >= 4); + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 4) { + const uint16x4_t pred_u16 = vld1_u16(&pred[j]); + const uint16x4_t ref_u16 = vld1_u16(&ref[j]); + const uint32x4_t sum_u32 = vaddl_u16(pred_u16, ref_u16); + const uint16x4_t vcomp_pred = + vshrn_n_u32(vaddq_u32(sum_u32, one_u32), 1); + vst1_u16(&comp_pred[j], vcomp_pred); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } +} + +/* All three forms of the variance are available in the same sizes. 
*/ +#define HIGHBD_VARIANCES(W, H) \ + HIGHBD_VAR(W, H) \ + HIGHBD_SUBPIX_VAR(W, H) \ + HIGHBD_SUBPIX_AVG_VAR(W, H) + +HIGHBD_VARIANCES(64, 64) +HIGHBD_VARIANCES(64, 32) +HIGHBD_VARIANCES(32, 64) +HIGHBD_VARIANCES(32, 32) +HIGHBD_VARIANCES(32, 16) +HIGHBD_VARIANCES(16, 32) +HIGHBD_VARIANCES(16, 16) +HIGHBD_VARIANCES(16, 8) +HIGHBD_VARIANCES(8, 16) +HIGHBD_VARIANCES(8, 8) +HIGHBD_VARIANCES(8, 4) +HIGHBD_VARIANCES(4, 8) +HIGHBD_VARIANCES(4, 4) + +HIGHBD_GET_VAR(8) +HIGHBD_GET_VAR(16) + +HIGHBD_MSE(16, 16) +HIGHBD_MSE(16, 8) +HIGHBD_MSE(8, 16) +HIGHBD_MSE(8, 8) diff --git a/libvpx/vpx_dsp/arm/mem_neon.h b/libvpx/vpx_dsp/arm/mem_neon.h index 50aaa94fe..19cfc7c7f 100644 --- a/libvpx/vpx_dsp/arm/mem_neon.h +++ b/libvpx/vpx_dsp/arm/mem_neon.h @@ -116,11 +116,11 @@ static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) { static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, ptrdiff_t stride) { uint32_t a; - uint32x2_t a_u32 = vdup_n_u32(0); + uint32x2_t a_u32; if (stride == 4) return vld1_u8(buf); memcpy(&a, buf, 4); buf += stride; - a_u32 = vset_lane_u32(a, a_u32, 0); + a_u32 = vdup_n_u32(a); memcpy(&a, buf, 4); a_u32 = vset_lane_u32(a, a_u32, 1); return vreinterpret_u8_u32(a_u32); @@ -143,11 +143,11 @@ static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride, static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, ptrdiff_t stride) { uint32_t a; - uint32x4_t a_u32 = vdupq_n_u32(0); + uint32x4_t a_u32; if (stride == 4) return vld1q_u8(buf); memcpy(&a, buf, 4); buf += stride; - a_u32 = vsetq_lane_u32(a, a_u32, 0); + a_u32 = vdupq_n_u32(a); memcpy(&a, buf, 4); buf += stride; a_u32 = vsetq_lane_u32(a, a_u32, 1); @@ -201,4 +201,161 @@ static INLINE void store_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) { buf += stride; vst1_lane_u32((uint32_t *)buf, a_u32, 1); } + +static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); +} + +static INLINE void store_u8_8x4(uint8_t *s, const ptrdiff_t p, + const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); + s += p; + vst1_u8(s, s3); +} + +static INLINE void load_u8_16x4(const uint8_t *s, const ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += p; + *s3 = vld1q_u8(s); +} + +static INLINE void store_u8_16x4(uint8_t *s, const ptrdiff_t p, + const uint8x16_t s0, const uint8x16_t s1, + const uint8x16_t s2, const uint8x16_t s3) { + vst1q_u8(s, s0); + s += p; + vst1q_u8(s, s1); + s += p; + vst1q_u8(s, s2); + s += p; + vst1q_u8(s, s3); +} + +static INLINE void load_u8_8x7(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3, + uint8x8_t *const s4, uint8x8_t *const s5, + uint8x8_t *const s6) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); + s += p; + *s4 = vld1_u8(s); + s += p; + *s5 = vld1_u8(s); + s += p; + *s6 = vld1_u8(s); +} + +static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3, + uint8x8_t *const s4, uint8x8_t *const s5, + uint8x8_t 
*const s6, uint8x8_t *const s7) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); + s += p; + *s4 = vld1_u8(s); + s += p; + *s5 = vld1_u8(s); + s += p; + *s6 = vld1_u8(s); + s += p; + *s7 = vld1_u8(s); +} + +static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p, + const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3, + const uint8x8_t s4, const uint8x8_t s5, + const uint8x8_t s6, const uint8x8_t s7) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); + s += p; + vst1_u8(s, s3); + s += p; + vst1_u8(s, s4); + s += p; + vst1_u8(s, s5); + s += p; + vst1_u8(s, s6); + s += p; + vst1_u8(s, s7); +} + +static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3, + uint8x16_t *const s4, uint8x16_t *const s5, + uint8x16_t *const s6, uint8x16_t *const s7) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += p; + *s3 = vld1q_u8(s); + s += p; + *s4 = vld1q_u8(s); + s += p; + *s5 = vld1q_u8(s); + s += p; + *s6 = vld1q_u8(s); + s += p; + *s7 = vld1q_u8(s); +} + +static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p, + const uint8x16_t s0, const uint8x16_t s1, + const uint8x16_t s2, const uint8x16_t s3, + const uint8x16_t s4, const uint8x16_t s5, + const uint8x16_t s6, const uint8x16_t s7) { + vst1q_u8(s, s0); + s += p; + vst1q_u8(s, s1); + s += p; + vst1q_u8(s, s2); + s += p; + vst1q_u8(s, s3); + s += p; + vst1q_u8(s, s4); + s += p; + vst1q_u8(s, s5); + s += p; + vst1q_u8(s, s6); + s += p; + vst1q_u8(s, s7); +} + #endif // VPX_VPX_DSP_ARM_MEM_NEON_H_ diff --git a/libvpx/vpx_dsp/arm/quantize_neon.c b/libvpx/vpx_dsp/arm/quantize_neon.c index bd7818a07..9c227d560 100644 --- a/libvpx/vpx_dsp/arm/quantize_neon.c +++ b/libvpx/vpx_dsp/arm/quantize_neon.c @@ -17,20 +17,57 @@ static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, const int16x8_t dequant, - tran_low_t *dqcoeff) { + tran_low_t *dqcoeff_ptr) { +#if CONFIG_VP9_HIGHBITDEPTH const int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); const int32x4_t dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); -#if CONFIG_VP9_HIGHBITDEPTH - vst1q_s32(dqcoeff, dqcoeff_0); - vst1q_s32(dqcoeff + 4, dqcoeff_1); + vst1q_s32(dqcoeff_ptr, dqcoeff_0); + vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1); #else - vst1q_s16(dqcoeff, vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1))); + vst1q_s16(dqcoeff_ptr, vmulq_s16(qcoeff, dequant)); #endif // CONFIG_VP9_HIGHBITDEPTH } +static INLINE int16x8_t +quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16x8_t zbin, + const int16x8_t round, const int16x8_t quant, + const int16x8_t quant_shift, const int16x8_t dequant) { + // Load coeffs as 8 x 16-bit ints, take sign and abs values + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + // Calculate mask of elements outside the bin + const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + // Get the rounded values + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 
16 >> 1 == (qcoeff * quant_shift) >> 16 + qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + // Only keep the relevant coeffs + qcoeff = vandq_s16(qcoeff, zbin_mask); + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + + calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr); + + return qcoeff; +} + void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, @@ -38,109 +75,59 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; - (void)scan; + + // Only the first element of each vector is DC. + int16x8_t zbin = vld1q_s16(zbin_ptr); + int16x8_t round = vld1q_s16(round_ptr); + int16x8_t quant = vld1q_s16(quant_ptr); + int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + int16x8_t dequant = vld1q_s16(dequant_ptr); // Process first 8 values which include a dc component. { - // Only the first element of each vector is DC. - const int16x8_t zbin = vld1q_s16(zbin_ptr); - const int16x8_t round = vld1q_s16(round_ptr); - const int16x8_t quant = vld1q_s16(quant_ptr); - const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); - const int16x8_t dequant = vld1q_s16(dequant_ptr); - // Add one because the eob does not index from 0. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); - - const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); - const int16x8_t coeff_abs = vabsq_s16(coeff); - - const int16x8_t zbin_mask = - vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); - const int16x8_t rounded = vqaddq_s16(coeff_abs, round); - - // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 - int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); - - qcoeff = vaddq_s16(qcoeff, rounded); - - // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16 - qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1); - - // Restore the sign bit. - qcoeff = veorq_s16(qcoeff, coeff_sign); - qcoeff = vsubq_s16(qcoeff, coeff_sign); - - qcoeff = vandq_s16(qcoeff, zbin_mask); + const int16x8_t qcoeff = + quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, quant, + quant_shift, dequant); // Set non-zero elements to -1 and use that to extract values for eob. 
eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + __builtin_prefetch(coeff_ptr + 64); coeff_ptr += 8; iscan += 8; - - store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; - - calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr); dqcoeff_ptr += 8; } n_coeffs -= 8; { - const int16x8_t zbin = vdupq_n_s16(zbin_ptr[1]); - const int16x8_t round = vdupq_n_s16(round_ptr[1]); - const int16x8_t quant = vdupq_n_s16(quant_ptr[1]); - const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]); - const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]); + zbin = vdupq_lane_s16(vget_low_s16(zbin), 1); + round = vdupq_lane_s16(vget_low_s16(round), 1); + quant = vdupq_lane_s16(vget_low_s16(quant), 1); + quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1); + dequant = vdupq_lane_s16(vget_low_s16(dequant), 1); do { - // Add one because the eob is not its index. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); - - const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); - const int16x8_t coeff_abs = vabsq_s16(coeff); - - const int16x8_t zbin_mask = - vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); - const int16x8_t rounded = vqaddq_s16(coeff_abs, round); - - // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 - int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); - - qcoeff = vaddq_s16(qcoeff, rounded); - - // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16 - qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1); - - // Restore the sign bit. - qcoeff = veorq_s16(qcoeff, coeff_sign); - qcoeff = vsubq_s16(qcoeff, coeff_sign); - - qcoeff = vandq_s16(qcoeff, zbin_mask); + const int16x8_t qcoeff = + quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, + quant, quant_shift, dequant); // Set non-zero elements to -1 and use that to extract values for eob. 
eob_max = vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + __builtin_prefetch(coeff_ptr + 64); coeff_ptr += 8; iscan += 8; - - store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; - - calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr); dqcoeff_ptr += 8; - n_coeffs -= 8; } while (n_coeffs > 0); } @@ -156,6 +143,9 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ + // Need these here, else the compiler complains about mixing declarations and + // code in C90 + (void)scan; } static INLINE int32x4_t extract_sign_bit(int32x4_t a) { @@ -164,7 +154,7 @@ static INLINE int32x4_t extract_sign_bit(int32x4_t a) { static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff, const int16x8_t dequant, - tran_low_t *dqcoeff) { + tran_low_t *dqcoeff_ptr) { int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); int32x4_t dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); @@ -176,14 +166,51 @@ static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff, #if CONFIG_VP9_HIGHBITDEPTH dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1); dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1); - vst1q_s32(dqcoeff, dqcoeff_0); - vst1q_s32(dqcoeff + 4, dqcoeff_1); + vst1q_s32(dqcoeff_ptr, dqcoeff_0); + vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1); #else - vst1q_s16(dqcoeff, + vst1q_s16(dqcoeff_ptr, vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1))); #endif // CONFIG_VP9_HIGHBITDEPTH } +static INLINE int16x8_t +quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16x8_t zbin, + const int16x8_t round, const int16x8_t quant, + const int16x8_t quant_shift, const int16x8_t dequant) { + // Load coeffs as 8 x 16-bit ints, take sign and abs values + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + // Calculate mask of elements outside the bin + const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + // Get the rounded values + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15 + qcoeff = vqdmulhq_s16(qcoeff, quant_shift); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + // Only keep the relevant coeffs + qcoeff = vandq_s16(qcoeff, zbin_mask); + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + + calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr); + + return qcoeff; +} + // Main difference is that zbin values are halved before comparison and dqcoeff // values are divided by 2. zbin is rounded but dqcoeff is not. void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, @@ -194,107 +221,57 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; - (void)scan; - (void)n_coeffs; // Because we will always calculate 32*32. 
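The "(x * q * 2) >> 16 >> 1 == (x * q) >> 16" comments in both quantizers describe the vqdmulhq_s16 idiom: the instruction computes a doubling high half, so one extra right shift recovers the plain 16-bit high half that the scalar reference quantizer uses. A minimal standalone sketch of the identity (plain C, not libvpx code):

#include <assert.h>
#include <stdint.h>

/* Scalar model of one vqdmulhq_s16 lane: (a * b * 2) >> 16. Saturation only
   triggers for a == b == INT16_MIN, which quantizer parameters never hit. */
static int16_t sdmulh16(int16_t a, int16_t b) {
  return (int16_t)(((int32_t)a * b * 2) >> 16);
}

int main(void) {
  const int16_t rounded = 12345, quant = 21845;
  assert((int16_t)(sdmulh16(rounded, quant) >> 1) ==
         (int16_t)(((int32_t)rounded * quant) >> 16));
  return 0;
}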
+ + // Only the first element of each vector is DC. + int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1); + int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); + int16x8_t quant = vld1q_s16(quant_ptr); + int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + int16x8_t dequant = vld1q_s16(dequant_ptr); // Process first 8 values which include a dc component. { - // Only the first element of each vector is DC. - const int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1); - const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); - const int16x8_t quant = vld1q_s16(quant_ptr); - const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); - const int16x8_t dequant = vld1q_s16(dequant_ptr); - // Add one because the eob does not index from 0. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); - - const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); - const int16x8_t coeff_abs = vabsq_s16(coeff); - - const int16x8_t zbin_mask = - vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); - - const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); - // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 - int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); - - qcoeff = vaddq_s16(qcoeff, rounded); - - // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15 - qcoeff = vqdmulhq_s16(qcoeff, quant_shift); - - // Restore the sign bit. - qcoeff = veorq_s16(qcoeff, coeff_sign); - qcoeff = vsubq_s16(qcoeff, coeff_sign); - - qcoeff = vandq_s16(qcoeff, zbin_mask); + const int16x8_t qcoeff = + quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, + quant, quant_shift, dequant); // Set non-zero elements to -1 and use that to extract values for eob. eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + __builtin_prefetch(coeff_ptr + 64); coeff_ptr += 8; iscan += 8; - - store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; - - calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr); dqcoeff_ptr += 8; } { - const int16x8_t zbin = vrshrq_n_s16(vdupq_n_s16(zbin_ptr[1]), 1); - const int16x8_t round = vrshrq_n_s16(vdupq_n_s16(round_ptr[1]), 1); - const int16x8_t quant = vdupq_n_s16(quant_ptr[1]); - const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]); - const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]); + zbin = vdupq_lane_s16(vget_low_s16(zbin), 1); + round = vdupq_lane_s16(vget_low_s16(round), 1); + quant = vdupq_lane_s16(vget_low_s16(quant), 1); + quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1); + dequant = vdupq_lane_s16(vget_low_s16(dequant), 1); for (i = 1; i < 32 * 32 / 8; ++i) { - // Add one because the eob is not its index. 
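calculate_dqcoeff_and_store_32x32() halves the qcoeff * dequant product the way the reference C code does, with integer division that truncates toward zero, whereas a plain arithmetic shift floors. Adding the sign bit first (the value extract_sign_bit() pulls out) turns the flooring shift into truncation. The identity in scalar form (a sketch, not libvpx code):

#include <assert.h>
#include <stdint.h>

/* x / 2 truncates toward zero in C; (x + sign_bit) >> 1 matches it exactly. */
static int32_t halve_like_c(int32_t x) {
  const int32_t sign_bit = (int32_t)((uint32_t)x >> 31); /* 1 iff x < 0 */
  return (x + sign_bit) >> 1;
}

int main(void) {
  const int32_t samples[] = { 7, 6, 1, 0, -1, -6, -7 };
  int i;
  for (i = 0; i < 7; ++i) assert(halve_like_c(samples[i]) == samples[i] / 2);
  return 0;
}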
- const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); - - const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); - const int16x8_t coeff_abs = vabsq_s16(coeff); - - const int16x8_t zbin_mask = - vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); - const int16x8_t rounded = vqaddq_s16(coeff_abs, round); - - // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 - int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); - - qcoeff = vaddq_s16(qcoeff, rounded); - - // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15 - qcoeff = vqdmulhq_s16(qcoeff, quant_shift); - - // Restore the sign bit. - qcoeff = veorq_s16(qcoeff, coeff_sign); - qcoeff = vsubq_s16(qcoeff, coeff_sign); - - qcoeff = vandq_s16(qcoeff, zbin_mask); + const int16x8_t qcoeff = + quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, + quant, quant_shift, dequant); // Set non-zero elements to -1 and use that to extract values for eob. eob_max = vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + __builtin_prefetch(coeff_ptr + 64); coeff_ptr += 8; iscan += 8; - - store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; - - calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr); dqcoeff_ptr += 8; } } @@ -310,4 +287,8 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ + // Need these here, else the compiler complains about mixing declarations and + // code in C90 + (void)n_coeffs; + (void)scan; } diff --git a/libvpx/vpx_dsp/arm/sad4d_neon.c b/libvpx/vpx_dsp/arm/sad4d_neon.c index 03f716c3d..5fc621aee 100644 --- a/libvpx/vpx_dsp/arm/sad4d_neon.c +++ b/libvpx/vpx_dsp/arm/sad4d_neon.c @@ -20,9 +20,9 @@ static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0, const void *const buf1) { uint32_t a; - uint32x2_t aa = vdup_n_u32(0); + uint32x2_t aa; memcpy(&a, buf0, 4); - aa = vset_lane_u32(a, aa, 0); + aa = vdup_n_u32(a); memcpy(&a, buf1, 4); aa = vset_lane_u32(a, aa, 1); return vreinterpret_u8_u32(aa); @@ -237,8 +237,7 @@ void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, //////////////////////////////////////////////////////////////////////////////// -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, uint32x4_t *const sum) { @@ -270,7 +269,7 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, vst1q_u32(sad_array, vpaddq_u32(r0, r1)); } -#else +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, uint16x8_t *const sum) { @@ -305,7 +304,7 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, sad_512_pel_final_neon(sum, sad_array); } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, @@ -327,8 +326,7 @@ void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, //////////////////////////////////////////////////////////////////////////////// -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 
1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, @@ -386,7 +384,7 @@ void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 64); } -#else +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, @@ -444,12 +442,11 @@ void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, sad_2048_pel_final_neon(sum, sad_array); } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) //////////////////////////////////////////////////////////////////////////////// -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, @@ -554,7 +551,7 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, vst1q_u32(sad_array, vpaddq_u32(r0, r1)); } -#else +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, @@ -649,4 +646,4 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, sad_4096_pel_final_neon(sum, sad_array); } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) diff --git a/libvpx/vpx_dsp/arm/sad_neon.c b/libvpx/vpx_dsp/arm/sad_neon.c index b1509d883..ad575d4aa 100644 --- a/libvpx/vpx_dsp/arm/sad_neon.c +++ b/libvpx/vpx_dsp/arm/sad_neon.c @@ -21,9 +21,15 @@ uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); +#if defined(__ARM_FEATURE_DOTPROD) + const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8); + const uint32x4_t dp = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1)); + return horizontal_add_uint32x4(dp); +#else uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8)); abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); return horizontal_add_uint16x8(abs); +#endif } uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, @@ -33,13 +39,34 @@ uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); const uint8x16_t second_pred_u8 = vld1q_u8(second_pred); const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); +#if defined(__ARM_FEATURE_DOTPROD) + const uint8x16_t sad_u8 = vabdq_u8(src_u8, avg); + const uint32x4_t prod = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1)); + return horizontal_add_uint32x4(prod); +#else uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(avg)); abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); return horizontal_add_uint16x8(abs); +#endif } uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { +#if defined(__ARM_FEATURE_DOTPROD) + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t ref1_u8 = load_unaligned_u8q(ref_ptr, ref_stride); + const 
uint8x16_t src2_u8 = + load_unaligned_u8q(src_ptr + 4 * src_stride, src_stride); + const uint8x16_t ref2_u8 = + load_unaligned_u8q(ref_ptr + 4 * ref_stride, ref_stride); + const uint8x16_t sad1_u8 = vabdq_u8(src1_u8, ref1_u8); + const uint8x16_t sad2_u8 = vabdq_u8(src2_u8, ref2_u8); + prod = vdotq_u32(prod, sad1_u8, ones); + prod = vdotq_u32(prod, sad2_u8, ones); + return horizontal_add_uint32x4(prod); +#else int i; uint16x8_t abs = vdupq_n_u16(0); for (i = 0; i < 8; i += 4) { @@ -52,11 +79,31 @@ uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, } return horizontal_add_uint16x8(abs); +#endif } uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred) { +#if defined(__ARM_FEATURE_DOTPROD) + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t ref1_u8 = load_unaligned_u8q(ref_ptr, ref_stride); + const uint8x16_t src2_u8 = + load_unaligned_u8q(src_ptr + 4 * src_stride, src_stride); + const uint8x16_t ref2_u8 = + load_unaligned_u8q(ref_ptr + 4 * ref_stride, ref_stride); + const uint8x16_t second_pred1_u8 = vld1q_u8(second_pred); + const uint8x16_t second_pred2_u8 = vld1q_u8(second_pred + 16); + const uint8x16_t avg1 = vrhaddq_u8(ref1_u8, second_pred1_u8); + const uint8x16_t avg2 = vrhaddq_u8(ref2_u8, second_pred2_u8); + const uint8x16_t sad1_u8 = vabdq_u8(src1_u8, avg1); + const uint8x16_t sad2_u8 = vabdq_u8(src2_u8, avg2); + prod = vdotq_u32(prod, sad1_u8, ones); + prod = vdotq_u32(prod, sad2_u8, ones); + return horizontal_add_uint32x4(prod); +#else int i; uint16x8_t abs = vdupq_n_u16(0); for (i = 0; i < 8; i += 4) { @@ -72,8 +119,65 @@ uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, } return horizontal_add_uint16x8(abs); +#endif } +#if defined(__ARM_FEATURE_DOTPROD) +static INLINE uint32x2_t sad8x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const int height) { + int i; + uint32x2_t prod = vdup_n_u32(0); + const uint8x8_t ones = vdup_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x8_t a_u8 = vld1_u8(src_ptr); + const uint8x8_t b_u8 = vld1_u8(ref_ptr); + const uint8x8_t sad_u8 = vabd_u8(a_u8, b_u8); + src_ptr += src_stride; + ref_ptr += ref_stride; + prod = vdot_u32(prod, sad_u8, ones); + } + return prod; +} + +static INLINE uint32x2_t sad8x_avg(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const int height) { + int i; + uint32x2_t prod = vdup_n_u32(0); + const uint8x8_t ones = vdup_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x8_t a_u8 = vld1_u8(src_ptr); + const uint8x8_t b_u8 = vld1_u8(ref_ptr); + const uint8x8_t c_u8 = vld1_u8(second_pred); + const uint8x8_t avg = vrhadd_u8(b_u8, c_u8); + const uint8x8_t sad_u8 = vabd_u8(a_u8, avg); + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 8; + prod = vdot_u32(prod, sad_u8, ones); + } + return prod; +} + +#define SAD8XN(n) \ + uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + const uint32x2_t prod = \ + sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ + return horizontal_add_uint32x2(prod); \ + } \ + \ + uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint32x2_t prod = \ + sad8x_avg(src_ptr, 
src_stride, ref_ptr, ref_stride, second_pred, n); \ + return horizontal_add_uint32x2(prod); \ + } + +#else // !defined(__ARM_FEATURE_DOTPROD) static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -124,11 +228,67 @@ static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride, sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint16x8(abs); \ } +#endif // defined(__ARM_FEATURE_DOTPROD) SAD8XN(4) SAD8XN(8) SAD8XN(16) +#if defined(__ARM_FEATURE_DOTPROD) +static INLINE uint32x4_t sad16x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const int height) { + int i; + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x16_t src_u8 = vld1q_u8(src_ptr); + const uint8x16_t ref_u8 = vld1q_u8(ref_ptr); + const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8); + src_ptr += src_stride; + ref_ptr += ref_stride; + prod = vdotq_u32(prod, sad_u8, ones); + } + return prod; +} + +static INLINE uint32x4_t sad16x_avg(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const int height) { + int i; + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x16_t a_u8 = vld1q_u8(src_ptr); + const uint8x16_t b_u8 = vld1q_u8(ref_ptr); + const uint8x16_t c_u8 = vld1q_u8(second_pred); + const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8); + const uint8x16_t sad_u8 = vabdq_u8(a_u8, avg); + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; + prod = vdotq_u32(prod, sad_u8, ones); + } + return prod; +} + +#define SAD16XN(n) \ + uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + const uint32x4_t prod = \ + sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ + return horizontal_add_uint32x4(prod); \ + } \ + \ + uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint32x4_t prod = \ + sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ + return horizontal_add_uint32x4(prod); \ + } +#else // !defined(__ARM_FEATURE_DOTPROD) static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -182,11 +342,78 @@ static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride, sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint16x8(abs); \ } +#endif // defined(__ARM_FEATURE_DOTPROD) SAD16XN(8) SAD16XN(16) SAD16XN(32) +#if defined(__ARM_FEATURE_DOTPROD) +static INLINE uint32x4_t sad32x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const int height) { + int i; + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x16_t a_lo = vld1q_u8(src_ptr); + const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); + const uint8x16_t b_lo = vld1q_u8(ref_ptr); + const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); + const uint8x16_t sad_lo_u8 = vabdq_u8(a_lo, b_lo); + const uint8x16_t sad_hi_u8 = vabdq_u8(a_hi, b_hi); + src_ptr += src_stride; + ref_ptr += ref_stride; + prod = vdotq_u32(prod, sad_lo_u8, ones); + prod = vdotq_u32(prod, sad_hi_u8, ones); + } + return 
prod; +} + +static INLINE uint32x4_t sad32x_avg(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const int height) { + int i; + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x16_t a_lo = vld1q_u8(src_ptr); + const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); + const uint8x16_t b_lo = vld1q_u8(ref_ptr); + const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); + const uint8x16_t c_lo = vld1q_u8(second_pred); + const uint8x16_t c_hi = vld1q_u8(second_pred + 16); + const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo); + const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi); + const uint8x16_t sad_lo_u8 = vabdq_u8(a_lo, avg_lo); + const uint8x16_t sad_hi_u8 = vabdq_u8(a_hi, avg_hi); + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 32; + prod = vdotq_u32(prod, sad_lo_u8, ones); + prod = vdotq_u32(prod, sad_hi_u8, ones); + } + return prod; +} + +#define SAD32XN(n) \ + uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + const uint32x4_t prod = \ + sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ + return horizontal_add_uint32x4(prod); \ + } \ + \ + uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint32x4_t prod = \ + sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ + return horizontal_add_uint32x4(prod); \ + } + +#else // defined(__ARM_FEATURE_DOTPROD) static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -250,11 +477,81 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride, sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint16x8(abs); \ } +#endif // defined(__ARM_FEATURE_DOTPROD) SAD32XN(16) SAD32XN(32) SAD32XN(64) +#if defined(__ARM_FEATURE_DOTPROD) +static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const int height) { + int i; + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x16_t a_0 = vld1q_u8(src_ptr); + const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); + const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); + const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); + const uint8x16_t b_0 = vld1q_u8(ref_ptr); + const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); + const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); + const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); + const uint8x16_t sad_0_u8 = vabdq_u8(a_0, b_0); + const uint8x16_t sad_1_u8 = vabdq_u8(a_1, b_1); + const uint8x16_t sad_2_u8 = vabdq_u8(a_2, b_2); + const uint8x16_t sad_3_u8 = vabdq_u8(a_3, b_3); + src_ptr += src_stride; + ref_ptr += ref_stride; + prod = vdotq_u32(prod, sad_0_u8, ones); + prod = vdotq_u32(prod, sad_1_u8, ones); + prod = vdotq_u32(prod, sad_2_u8, ones); + prod = vdotq_u32(prod, sad_3_u8, ones); + } + return prod; +} + +static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const int height) { + int i; + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x16_t a_0 = vld1q_u8(src_ptr); + const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); + const uint8x16_t 
a_2 = vld1q_u8(src_ptr + 32); + const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); + const uint8x16_t b_0 = vld1q_u8(ref_ptr); + const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); + const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); + const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); + const uint8x16_t c_0 = vld1q_u8(second_pred); + const uint8x16_t c_1 = vld1q_u8(second_pred + 16); + const uint8x16_t c_2 = vld1q_u8(second_pred + 32); + const uint8x16_t c_3 = vld1q_u8(second_pred + 48); + const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0); + const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1); + const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2); + const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3); + const uint8x16_t sad_0_u8 = vabdq_u8(a_0, avg_0); + const uint8x16_t sad_1_u8 = vabdq_u8(a_1, avg_1); + const uint8x16_t sad_2_u8 = vabdq_u8(a_2, avg_2); + const uint8x16_t sad_3_u8 = vabdq_u8(a_3, avg_3); + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 64; + prod = vdotq_u32(prod, sad_0_u8, ones); + prod = vdotq_u32(prod, sad_1_u8, ones); + prod = vdotq_u32(prod, sad_2_u8, ones); + prod = vdotq_u32(prod, sad_3_u8, ones); + } + return prod; +} +#else // !defined(__ARM_FEATURE_DOTPROD) static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -332,6 +629,7 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, return vpadalq_u16(sum, abs_1); } } +#endif // defined(__ARM_FEATURE_DOTPROD) #define SAD64XN(n) \ uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride, \ diff --git a/libvpx/vpx_dsp/arm/subpel_variance_neon.c b/libvpx/vpx_dsp/arm/subpel_variance_neon.c index a3befdc34..9328c3ed8 100644 --- a/libvpx/vpx_dsp/arm/subpel_variance_neon.c +++ b/libvpx/vpx_dsp/arm/subpel_variance_neon.c @@ -17,168 +17,474 @@ #include "vpx_dsp/variance.h" #include "vpx_dsp/arm/mem_neon.h" -static const uint8_t bilinear_filters[8][2] = { - { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, - { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, -}; - // Process a block exactly 4 wide and a multiple of 2 high. -static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - const uint8_t *filter) { - const uint8x8_t f0 = vdup_n_u8(filter[0]); - const uint8x8_t f1 = vdup_n_u8(filter[1]); - unsigned int i; - for (i = 0; i < output_height; i += 2) { - const uint8x8_t src_0 = load_unaligned_u8(src_ptr, src_pixels_per_line); - const uint8x8_t src_1 = - load_unaligned_u8(src_ptr + pixel_step, src_pixels_per_line); - const uint16x8_t a = vmull_u8(src_0, f0); - const uint16x8_t b = vmlal_u8(a, src_1, f1); - const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); - vst1_u8(output_ptr, out); - src_ptr += 2 * src_pixels_per_line; - output_ptr += 8; - } +static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride); + uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + vst1_u8(dst_ptr, blend_u8); + + src_ptr += 2 * src_stride; + dst_ptr += 2 * 4; + i -= 2; + } while (i != 0); } // Process a block exactly 8 wide and any height. 
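From here on the subpel filters take a filter_offset in [0, 8) and build their two taps as (8 - offset, offset) with a 3-bit rounding shift, replacing the 128-scaled bilinear_filters table removed above. The outputs are bit-identical because the old taps were exactly 16 times the new pair; a standalone scalar check (illustrative, not libvpx code):

#include <assert.h>

int main(void) {
  int offset, s0, s1;
  for (offset = 0; offset < 8; ++offset) {
    const int old_f0 = 128 - 16 * offset; /* bilinear_filters[offset][0] */
    const int old_f1 = 16 * offset;       /* bilinear_filters[offset][1] */
    for (s0 = 0; s0 < 256; ++s0) {
      for (s1 = 0; s1 < 256; ++s1) {
        /* Old path: 7-bit taps with FILTER_BITS == 7. New path: 3-bit taps. */
        assert(((s0 * old_f0 + s1 * old_f1 + 64) >> 7) ==
               ((s0 * (8 - offset) + s1 * offset + 4) >> 3));
      }
    }
  }
  return 0;
}

The same (8 - offset, offset) taps feed the 8-wide and 16-wide kernels that follow.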
-static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - const uint8_t *filter) { - const uint8x8_t f0 = vdup_n_u8(filter[0]); - const uint8x8_t f1 = vdup_n_u8(filter[1]); - unsigned int i; - for (i = 0; i < output_height; ++i) { - const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); - const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]); - const uint16x8_t a = vmull_u8(src_0, f0); - const uint16x8_t b = vmlal_u8(a, src_1, f1); - const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); - vst1_u8(output_ptr, out); - src_ptr += src_pixels_per_line; - output_ptr += 8; - } +static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = vld1_u8(src_ptr); + uint8x8_t s1 = vld1_u8(src_ptr + pixel_step); + uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + vst1_u8(dst_ptr, blend_u8); + + src_ptr += src_stride; + dst_ptr += 8; + } while (--i != 0); } // Process a block which is a multiple of 16 wide and any height. -static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { - const uint8x8_t f0 = vdup_n_u8(filter[0]); - const uint8x8_t f1 = vdup_n_u8(filter[1]); - unsigned int i, j; - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 16) { - const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]); - const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]); - const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0); - const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1); - const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS); - const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0); - const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1); - const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS); - vst1q_u8(output_ptr + j, vcombine_u8(out_lo, out_hi)); - } - src_ptr += src_pixels_per_line; - output_ptr += output_width; - } +static void var_filter_block2d_bil_large(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_width, + int dst_height, int filter_offset) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint16x8_t blend_l = + vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1); + uint16x8_t blend_h = + vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1); + uint8x8_t out_lo = vrshrn_n_u16(blend_l, 3); + uint8x8_t out_hi = vrshrn_n_u16(blend_h, 3); + vst1q_u8(dst_ptr + j, vcombine_u8(out_lo, out_hi)); + + j += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16, + dst_height, filter_offset); +} +static void var_filter_block2d_bil_w32(const uint8_t *src_ptr, uint8_t *dst_ptr, + int
src_stride, int pixel_step, + int dst_height, int filter_offset) { + var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32, + dst_height, filter_offset); +} +static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64, + dst_height, filter_offset); +} + +static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_width, int dst_height) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint8x16_t avg = vrhaddq_u8(s0, s1); + vst1q_u8(dst_ptr + j, avg); + + j += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); } -// 4xM filter writes an extra row to fdata because it processes two rows at a -// time. -#define SUB_PIXEL_VARIANCENXM(n, m) \ - uint32_t vpx_sub_pixel_variance##n##x##m##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ - uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \ - uint8_t temp1[n * m]; \ - \ - if (n == 4) { \ - var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2), \ - bilinear_filters[x_offset]); \ - var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \ - bilinear_filters[y_offset]); \ - } else if (n == 8) { \ - var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1), \ - bilinear_filters[x_offset]); \ - var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \ - bilinear_filters[y_offset]); \ - } else { \ - var_filter_block2d_bil_w16(src_ptr, temp0, src_stride, 1, (m + 1), n, \ - bilinear_filters[x_offset]); \ - var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \ - bilinear_filters[y_offset]); \ - } \ - return vpx_variance##n##x##m(temp1, n, ref_ptr, ref_stride, sse); \ +#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse) { \ + uint8_t tmp0[w * (h + padding)]; \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } -SUB_PIXEL_VARIANCENXM(4, 4) -SUB_PIXEL_VARIANCENXM(4, 8) -SUB_PIXEL_VARIANCENXM(8, 4) -SUB_PIXEL_VARIANCENXM(8, 8) -SUB_PIXEL_VARIANCENXM(8, 16) -SUB_PIXEL_VARIANCENXM(16, 8) -SUB_PIXEL_VARIANCENXM(16, 16) -SUB_PIXEL_VARIANCENXM(16, 32) -SUB_PIXEL_VARIANCENXM(32, 16) -SUB_PIXEL_VARIANCENXM(32, 32) -SUB_PIXEL_VARIANCENXM(32, 64) -SUB_PIXEL_VARIANCENXM(64, 32) -SUB_PIXEL_VARIANCENXM(64, 64) - -// 4xM filter writes an extra row to fdata because it processes two rows at a -// time. 
-#define SUB_PIXEL_AVG_VARIANCENXM(n, m) \ - uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ +#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse) { \ + if (xoffset == 0) { \ + if (yoffset == 0) { \ + return vpx_variance##w##x##h##_neon(src, src_stride, ref, ref_stride, \ + sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp[w * h]; \ + var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \ + return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \ + yoffset); \ + return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \ + return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ + var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } \ + } else { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \ + return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } \ + } \ + } + +// 4x<h> blocks are processed two rows at a time, so require an extra row of +// padding. +SUBPEL_VARIANCE_WXH_NEON(4, 4, 2) +SUBPEL_VARIANCE_WXH_NEON(4, 8, 2) + +SUBPEL_VARIANCE_WXH_NEON(8, 4, 1) +SUBPEL_VARIANCE_WXH_NEON(8, 8, 1) +SUBPEL_VARIANCE_WXH_NEON(8, 16, 1) + +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1) + +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1) + +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1) + +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 4. 
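The SPECIALIZED_* macro above peels off offsets 0 (no filtering needed) and 4 (an even blend). Offset 4 degenerates to a rounding average, which is why var_filter_block2d_avg() can use vrhaddq_u8 instead of the multiply path; the reduction in scalar form (a sketch):

#include <assert.h>

int main(void) {
  int s0, s1;
  for (s0 = 0; s0 < 256; ++s0) {
    for (s1 = 0; s1 < 256; ++s1) {
      /* Taps (4, 4) with bilinear rounding equal a rounding halving add,
         i.e. exactly what vrhaddq_u8 computes per lane. */
      assert(((4 * s0 + 4 * s1 + 4) >> 3) == ((s0 + s1 + 1) >> 1));
    }
  }
  return 0;
}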
+static void avg_pred_var_filter_block2d_bil_w4(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset, + const uint8_t *second_pred) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride); + uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + + uint8x8_t p = vld1_u8(second_pred); + uint8x8_t avg = vrhadd_u8(blend_u8, p); + + vst1_u8(dst_ptr, avg); + + src_ptr += 2 * src_stride; + dst_ptr += 2 * 4; + second_pred += 2 * 4; + i -= 2; + } while (i != 0); +} + +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 8. +static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset, + const uint8_t *second_pred) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = vld1_u8(src_ptr); + uint8x8_t s1 = vld1_u8(src_ptr + pixel_step); + uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + + uint8x8_t p = vld1_u8(second_pred); + uint8x8_t avg = vrhadd_u8(blend_u8, p); + + vst1_u8(dst_ptr, avg); + + src_ptr += src_stride; + dst_ptr += 8; + second_pred += 8; + } while (--i > 0); +} + +// Combine bilinear filter with vpx_comp_avg_pred for large blocks. +static void avg_pred_var_filter_block2d_bil_large( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_width, int dst_height, int filter_offset, + const uint8_t *second_pred) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint16x8_t blend_l = + vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1); + uint16x8_t blend_h = + vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1); + uint8x16_t blend_u8 = + vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3)); + + uint8x16_t p = vld1q_u8(second_pred); + uint8x16_t avg = vrhaddq_u8(blend_u8, p); + + vst1q_u8(dst_ptr + j, avg); + + j += 16; + second_pred += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 16. +static void avg_pred_var_filter_block2d_bil_w16( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint8_t *second_pred) { + avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 16, dst_height, + filter_offset, second_pred); +} + +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 32. +static void avg_pred_var_filter_block2d_bil_w32( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint8_t *second_pred) { + avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 32, dst_height, + filter_offset, second_pred); +} + +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 64. 
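These avg_pred_var_filter_* helpers fuse the compound average into the bilinear pass, so the filtered block is averaged with second_pred on the fly rather than in a separate pass over an intermediate buffer. Per pixel, the fused operation looks like this (a sketch; names are illustrative, not libvpx code):

#include <stdint.h>

/* One output pixel of the fused filter + compound-average helpers: bilinear
   blend first (3-bit taps, rounded), then a rounding average with the
   matching second_pred pixel, which is what vrhadd_u8 does lane-wise. */
static uint8_t fused_pixel(uint8_t s0, uint8_t s1, int filter_offset,
                           uint8_t second_pred) {
  const uint8_t blend =
      (uint8_t)((s0 * (8 - filter_offset) + s1 * filter_offset + 4) >> 3);
  return (uint8_t)((blend + second_pred + 1) >> 1);
}

Applying fused_pixel across a row with the appropriate strides reproduces what each width-specialized helper stores.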
+static void avg_pred_var_filter_block2d_bil_w64( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint8_t *second_pred) { + avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 64, dst_height, + filter_offset, second_pred); +} + +// Combine averaging subpel filter with vpx_comp_avg_pred. +static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_width, + int dst_height, + const uint8_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint8x16_t avg = vrhaddq_u8(s0, s1); + + uint8x16_t p = vld1q_u8(second_pred); + avg = vrhaddq_u8(avg, p); + + vst1q_u8(dst_ptr + j, avg); + + j += 16; + second_pred += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +// Implementation of vpx_comp_avg_pred for blocks having width >= 16. +static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, + int dst_width, int dst_height, + const uint8_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint8x16_t s = vld1q_u8(src_ptr + j); + uint8x16_t p = vld1q_u8(second_pred); + + uint8x16_t avg = vrhaddq_u8(s, p); + + vst1q_u8(dst_ptr + j, avg); + + j += 16; + second_pred += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int source_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ const uint8_t *second_pred) { \ - uint8_t temp0[n * (m + (n == 4 ? 
2 : 1))]; \ - uint8_t temp1[n * m]; \ - \ - if (n == 4) { \ - var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2), \ - bilinear_filters[x_offset]); \ - var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \ - bilinear_filters[y_offset]); \ - } else if (n == 8) { \ - var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1), \ - bilinear_filters[x_offset]); \ - var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \ - bilinear_filters[y_offset]); \ - } else { \ - var_filter_block2d_bil_w16(src_ptr, temp0, src_stride, 1, (m + 1), n, \ - bilinear_filters[x_offset]); \ - var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \ - bilinear_filters[y_offset]); \ - } \ - \ - vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n); \ - \ - return vpx_variance##n##x##m(temp0, n, ref_ptr, ref_stride, sse); \ + uint8_t tmp0[w * (h + padding)]; \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \ + xoffset); \ + avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ + second_pred); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } + +#define SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int source_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse, \ + const uint8_t *second_pred) { \ + if (xoffset == 0) { \ + uint8_t tmp[w * h]; \ + if (yoffset == 0) { \ + avg_pred(src, tmp, source_stride, w, h, second_pred); \ + return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + avg_pred_var_filter_block2d_avg(src, tmp, source_stride, \ + source_stride, w, h, second_pred); \ + return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + } else { \ + avg_pred_var_filter_block2d_bil_w##w( \ + src, tmp, source_stride, source_stride, h, yoffset, second_pred); \ + return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h, \ + second_pred); \ + return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \ + avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \ + avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ + second_pred); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } \ + } else { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h, \ + xoffset, second_pred); \ + return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \ + (h + padding), xoffset); \ + avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \ + (h + padding), xoffset); \ + 
avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ + second_pred); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } \ + } \ } -SUB_PIXEL_AVG_VARIANCENXM(4, 4) -SUB_PIXEL_AVG_VARIANCENXM(4, 8) -SUB_PIXEL_AVG_VARIANCENXM(8, 4) -SUB_PIXEL_AVG_VARIANCENXM(8, 8) -SUB_PIXEL_AVG_VARIANCENXM(8, 16) -SUB_PIXEL_AVG_VARIANCENXM(16, 8) -SUB_PIXEL_AVG_VARIANCENXM(16, 16) -SUB_PIXEL_AVG_VARIANCENXM(16, 32) -SUB_PIXEL_AVG_VARIANCENXM(32, 16) -SUB_PIXEL_AVG_VARIANCENXM(32, 32) -SUB_PIXEL_AVG_VARIANCENXM(32, 64) -SUB_PIXEL_AVG_VARIANCENXM(64, 32) -SUB_PIXEL_AVG_VARIANCENXM(64, 64) +// 4x<h> blocks are processed two rows at a time, so require an extra row of +// padding. +SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2) +SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2) + +SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1) + +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1) + +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1) + +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1) diff --git a/libvpx/vpx_dsp/arm/subtract_neon.c b/libvpx/vpx_dsp/arm/subtract_neon.c index 612897e24..2c008e48a 100644 --- a/libvpx/vpx_dsp/arm/subtract_neon.c +++ b/libvpx/vpx_dsp/arm/subtract_neon.c @@ -79,3 +79,59 @@ void vpx_subtract_block_neon(int rows, int cols, int16_t *diff, } while (r); } } + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, + const uint8_t *src8_ptr, + ptrdiff_t src_stride, + const uint8_t *pred8_ptr, + ptrdiff_t pred_stride, int bd) { + int r = rows, c; + uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8_ptr); + (void)bd; + + if (cols >= 16) { + do { + for (c = 0; c < cols; c += 16) { + const uint16x8_t s0 = vld1q_u16(&src[c + 0]); + const uint16x8_t s1 = vld1q_u16(&src[c + 8]); + const uint16x8_t p0 = vld1q_u16(&pred[c + 0]); + const uint16x8_t p1 = vld1q_u16(&pred[c + 8]); + const uint16x8_t d0 = vsubq_u16(s0, p0); + const uint16x8_t d1 = vsubq_u16(s1, p1); + vst1q_s16(&diff_ptr[c + 0], vreinterpretq_s16_u16(d0)); + vst1q_s16(&diff_ptr[c + 8], vreinterpretq_s16_u16(d1)); + } + diff_ptr += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + } else if (cols >= 8) { + do { + for (c = 0; c < cols; c += 8) { + const uint16x8_t s = vld1q_u16(&src[c]); + const uint16x8_t p = vld1q_u16(&pred[c]); + const uint16x8_t d0 = vsubq_u16(s, p); + vst1q_s16(&diff_ptr[c], vreinterpretq_s16_u16(d0)); + } + diff_ptr += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + } else if (cols >= 4) { + do { + for (c = 0; c < cols; c += 4) { + const uint16x4_t s = vld1_u16(&src[c]); + const uint16x4_t p = vld1_u16(&pred[c]); + const uint16x4_t v_diff = vsub_u16(s, p); + vst1_s16(&diff_ptr[c], vreinterpret_s16_u16(v_diff)); + } + diff_ptr += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libvpx/vpx_dsp/arm/transpose_neon.h b/libvpx/vpx_dsp/arm/transpose_neon.h index c098ad31b..41d44f2b1 100644 --- a/libvpx/vpx_dsp/arm/transpose_neon.h +++ b/libvpx/vpx_dsp/arm/transpose_neon.h @@ -568,6 +568,40 @@ static INLINE void 
transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1])); } +// Transpose 8x8 to a new location. +static INLINE void transpose_s16_8x8_new(const int16x8_t *a, int16x8_t *b) { + // Swap 16 bit elements. + const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); + const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); + const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); + const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); + + // Swap 32 bit elements. + const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), + vreinterpretq_s32_s16(c1.val[0])); + const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), + vreinterpretq_s32_s16(c1.val[1])); + const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), + vreinterpretq_s32_s16(c3.val[0])); + const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), + vreinterpretq_s32_s16(c3.val[1])); + + // Swap 64 bit elements + const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); + const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); + const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); + const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); + + b[0] = e0.val[0]; + b[1] = e1.val[0]; + b[2] = e2.val[0]; + b[3] = e3.val[0]; + b[4] = e0.val[1]; + b[5] = e1.val[1]; + b[6] = e2.val[1]; + b[7] = e3.val[1]; +} + static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, @@ -787,6 +821,51 @@ static INLINE void transpose_s32_8x8(int32x4x2_t *a0, int32x4x2_t *a1, a7->val[1] = c7.val[1]; } +// Helper transpose function for highbd FDCT variants +static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/, + int32x4_t *right /*[8]*/, + int32x4_t *out_left /*[8]*/, + int32x4_t *out_right /*[8]*/) { + int32x4x2_t out[8]; + + out[0].val[0] = left[0]; + out[0].val[1] = right[0]; + out[1].val[0] = left[1]; + out[1].val[1] = right[1]; + out[2].val[0] = left[2]; + out[2].val[1] = right[2]; + out[3].val[0] = left[3]; + out[3].val[1] = right[3]; + out[4].val[0] = left[4]; + out[4].val[1] = right[4]; + out[5].val[0] = left[5]; + out[5].val[1] = right[5]; + out[6].val[0] = left[6]; + out[6].val[1] = right[6]; + out[7].val[0] = left[7]; + out[7].val[1] = right[7]; + + transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + + out_left[0] = out[0].val[0]; + out_left[1] = out[1].val[0]; + out_left[2] = out[2].val[0]; + out_left[3] = out[3].val[0]; + out_left[4] = out[4].val[0]; + out_left[5] = out[5].val[0]; + out_left[6] = out[6].val[0]; + out_left[7] = out[7].val[0]; + out_right[0] = out[0].val[1]; + out_right[1] = out[1].val[1]; + out_right[2] = out[2].val[1]; + out_right[3] = out[3].val[1]; + out_right[4] = out[4].val[1]; + out_right[5] = out[5].val[1]; + out_right[6] = out[6].val[1]; + out_right[7] = out[7].val[1]; +} + static INLINE void transpose_u8_16x8( const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2, const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5, diff --git a/libvpx/vpx_dsp/arm/variance_neon.c b/libvpx/vpx_dsp/arm/variance_neon.c index 7b93f142b..3ccc4e807 100644 --- a/libvpx/vpx_dsp/arm/variance_neon.c +++ b/libvpx/vpx_dsp/arm/variance_neon.c @@ -19,345 +19,357 @@ #include "vpx_dsp/arm/sum_neon.h" #include "vpx_ports/mem.h" -#if defined(__ARM_FEATURE_DOTPROD) && (__ARM_FEATURE_DOTPROD == 1) +#if defined(__ARM_FEATURE_DOTPROD) // Process a block of width 4 four rows at a time. 
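In this __ARM_FEATURE_DOTPROD branch, one instruction serves two purposes: a dot product against a vector of ones sums sixteen bytes into four 32-bit lanes, and a dot product of the absolute differences with themselves accumulates the squared error. A scalar model of a single 32-bit lane (illustrative, not libvpx code):

#include <assert.h>
#include <stdint.h>

/* One lane of vdotq_u32: acc += d0*e0 + d1*e1 + d2*e2 + d3*e3. */
static uint32_t dot_lane(uint32_t acc, const uint8_t d[4],
                         const uint8_t e[4]) {
  return acc + d[0] * e[0] + d[1] * e[1] + d[2] * e[2] + d[3] * e[3];
}

int main(void) {
  const uint8_t abs_diff[4] = { 3, 0, 250, 7 };
  const uint8_t ones[4] = { 1, 1, 1, 1 };
  /* Dotting with ones gives a plain byte sum (used for 'sum'). */
  assert(dot_lane(0, abs_diff, ones) == 3 + 0 + 250 + 7);
  /* Dotting abs_diff with itself gives a sum of squares (used for 'sse'). */
  assert(dot_lane(0, abs_diff, abs_diff) == 9 + 0 + 62500 + 49);
  return 0;
}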
-static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int h, - uint32_t *sse, int *sum) { - int i; - uint32x4_t sum_a = vdupq_n_u32(0); - uint32x4_t sum_b = vdupq_n_u32(0); +static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); uint32x4_t sse_u32 = vdupq_n_u32(0); - for (i = 0; i < h; i += 4) { - const uint8x16_t a = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t b = load_unaligned_u8q(ref_ptr, ref_stride); + int i = h; + do { + const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride); - const uint8x16_t abs_diff = vabdq_u8(a, b); + const uint8x16_t abs_diff = vabdq_u8(s, r); sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); - sum_a = vdotq_u32(sum_a, a, vdupq_n_u8(1)); - sum_b = vdotq_u32(sum_b, b, vdupq_n_u8(1)); + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); src_ptr += 4 * src_stride; ref_ptr += 4 * ref_stride; - } + i -= 4; + } while (i != 0); - *sum = horizontal_add_int32x4(vreinterpretq_s32_u32(vsubq_u32(sum_a, sum_b))); + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); *sse = horizontal_add_uint32x4(sse_u32); } -// Process a block of any size where the width is divisible by 16. -static void variance_neon_w16(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int w, - int h, uint32_t *sse, int *sum) { - int i, j; - uint32x4_t sum_a = vdupq_n_u32(0); - uint32x4_t sum_b = vdupq_n_u32(0); +// Process a block of width 8 two rows at a time. +static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); uint32x4_t sse_u32 = vdupq_n_u32(0); - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 16) { - const uint8x16_t a = vld1q_u8(src_ptr + j); - const uint8x16_t b = vld1q_u8(ref_ptr + j); + int i = h; + do { + const uint8x16_t s = + vcombine_u8(vld1_u8(src_ptr), vld1_u8(src_ptr + src_stride)); + const uint8x16_t r = + vcombine_u8(vld1_u8(ref_ptr), vld1_u8(ref_ptr + ref_stride)); - const uint8x16_t abs_diff = vabdq_u8(a, b); - sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + i -= 2; + } while (i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of width 16 one row at a time. 
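Each of these helpers only produces the running sum and the sum of squared differences; the exported vpx_variance wrappers (outside this hunk) combine them with the usual population-variance identity, using a 64-bit intermediate because sum * sum can exceed 32 bits for 64x64 blocks. A sketch of that final step, assuming the block area w * h is a power of two (illustrative, not the library's exact macro):

#include <stdint.h>

/* variance = sse - sum^2 / (w * h); with w * h a power of two the division
   is a shift (e.g. a 16x16 block uses shift == 8). For 64x64, |sum| can be
   up to 64 * 64 * 255 = 1044480, so sum * sum needs 64-bit arithmetic. */
static uint32_t variance_from_sums(uint32_t sse, int sum, int shift) {
  return sse - (uint32_t)(((int64_t)sum * sum) >> shift);
}

int main(void) {
  /* A 16x16 block offset by +1 everywhere: sse == 256, sum == 256, var 0. */
  return variance_from_sums(256, 256, 8) == 0 ? 0 : 1;
}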
+static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + const uint8x16_t s = vld1q_u8(src_ptr); + const uint8x16_t r = vld1q_u8(ref_ptr); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); - sum_a = vdotq_u32(sum_a, a, vdupq_n_u8(1)); - sum_b = vdotq_u32(sum_b, b, vdupq_n_u8(1)); - } src_ptr += src_stride; ref_ptr += ref_stride; - } + } while (--i != 0); - *sum = horizontal_add_int32x4(vreinterpretq_s32_u32(vsubq_u32(sum_a, sum_b))); + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); *sse = horizontal_add_uint32x4(sse_u32); } -// Process a block of width 8 two rows at a time. -static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int h, - uint32_t *sse, int *sum) { - int i = 0; - uint32x2_t sum_a = vdup_n_u32(0); - uint32x2_t sum_b = vdup_n_u32(0); - uint32x2_t sse_lo_u32 = vdup_n_u32(0); - uint32x2_t sse_hi_u32 = vdup_n_u32(0); +// Process a block of any size where the width is divisible by 16. +static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int w, int h, uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + int i = h; do { - const uint8x8_t a_0 = vld1_u8(src_ptr); - const uint8x8_t a_1 = vld1_u8(src_ptr + src_stride); - const uint8x8_t b_0 = vld1_u8(ref_ptr); - const uint8x8_t b_1 = vld1_u8(ref_ptr + ref_stride); - - const uint8x8_t abs_diff_0 = vabd_u8(a_0, b_0); - const uint8x8_t abs_diff_1 = vabd_u8(a_1, b_1); - sse_lo_u32 = vdot_u32(sse_lo_u32, abs_diff_0, abs_diff_0); - sse_hi_u32 = vdot_u32(sse_hi_u32, abs_diff_1, abs_diff_1); - - sum_a = vdot_u32(sum_a, a_0, vdup_n_u8(1)); - sum_b = vdot_u32(sum_b, b_0, vdup_n_u8(1)); - sum_a = vdot_u32(sum_a, a_1, vdup_n_u8(1)); - sum_b = vdot_u32(sum_b, b_1, vdup_n_u8(1)); - - src_ptr += src_stride + src_stride; - ref_ptr += ref_stride + ref_stride; - i += 2; - } while (i < h); + int j = 0; + do { + const uint8x16_t s = vld1q_u8(src_ptr + j); + const uint8x16_t r = vld1q_u8(ref_ptr + j); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + j += 16; + } while (j < w); - *sum = horizontal_add_int32x2(vreinterpret_s32_u32(vsub_u32(sum_a, sum_b))); - *sse = horizontal_add_uint32x2(vadd_u32(sse_lo_u32, sse_hi_u32)); + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); } -#else +static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum); +} -// The variance helper functions use int16_t for sum. 8 values are accumulated -// and then added (at which point they expand up to int32_t). 
To avoid overflow, -// there can be no more than 32767 / 255 ~= 128 values accumulated in each -// column. For a 32x32 buffer, this results in 32 / 8 = 4 values per row * 32 -// rows = 128. Asserts have been added to each function to warn against reaching -// this limit. +static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum); +} -// Process a block of width 4 four rows at a time. -static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int h, - uint32_t *sse, int *sum) { - int i; +#else // !defined(__ARM_FEATURE_DOTPROD) + +// Process a block of width 4 two rows at a time. +static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { int16x8_t sum_s16 = vdupq_n_s16(0); - int32x4_t sse_lo_s32 = vdupq_n_s32(0); - int32x4_t sse_hi_s32 = vdupq_n_s32(0); + int32x4_t sse_s32 = vdupq_n_s32(0); + int i = h; - // Since width is only 4, sum_s16 only loads a half row per loop. + // Number of rows we can process before 'sum_s16' overflows: + // 32767 / 255 ~= 128, but we use an 8-wide accumulator; so 256 4-wide rows. assert(h <= 256); - for (i = 0; i < h; i += 4) { - const uint8x16_t a_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t b_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - const uint16x8_t diff_lo_u16 = - vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)); - const uint16x8_t diff_hi_u16 = - vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)); - - const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16); - const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16); - - sum_s16 = vaddq_s16(sum_s16, diff_lo_s16); - sum_s16 = vaddq_s16(sum_s16, diff_hi_s16); + do { + const uint8x8_t s = load_unaligned_u8(src_ptr, src_stride); + const uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride); + const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r)); - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16), - vget_low_s16(diff_lo_s16)); - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16), - vget_high_s16(diff_lo_s16)); + sum_s16 = vaddq_s16(sum_s16, diff); - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16), - vget_low_s16(diff_hi_s16)); - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16), - vget_high_s16(diff_hi_s16)); + sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff)); + sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff)); - src_ptr += 4 * src_stride; - ref_ptr += 4 * ref_stride; - } + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + i -= 2; + } while (i != 0); *sum = horizontal_add_int16x8(sum_s16); - *sse = horizontal_add_uint32x4( - vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); + *sse = (uint32_t)horizontal_add_int32x4(sse_s32); } -// Process a block of any size where the width is divisible by 16. -static void variance_neon_w16(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int w, - int h, uint32_t *sse, int *sum) { - int i, j; +// Process a block of width 8 one row at a time. 
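Without the dot-product extension, the kernels below widen each difference to 16 bits and square it with vmlal_s16; the int16 row accumulator is what the asserts police. Each lane grows by at most 255 per accumulated row, and 255 * 128 = 32640 is the largest such multiple still under 32767. A sketch of the shared per-eight-pixel step (illustrative name, not part of the patch):

#include <arm_neon.h>

/* One 8-pixel step of the widening path (illustrative name). Each 'sum'
 * lane grows by at most 255 per call, so only 128 calls can be accumulated
 * safely -- hence assert(h <= 128) in the kernels below, and h <= 256 for
 * width 4, where each call packs two 4-pixel rows into one vector. */
static inline void variance_step_8(uint8x8_t s, uint8x8_t r, int16x8_t *sum,
                                   int32x4_t *sse) {
  const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r));
  *sum = vaddq_s16(*sum, diff);
  *sse = vmlal_s16(*sse, vget_low_s16(diff), vget_low_s16(diff));
  *sse = vmlal_s16(*sse, vget_high_s16(diff), vget_high_s16(diff));
}

variance_large_neon further down sidesteps the same limit a different way: instead of asserting, it drains the 16-bit accumulators into 32-bit lanes with vpadalq_s16 every h_limit rows.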
+static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { int16x8_t sum_s16 = vdupq_n_s16(0); - int32x4_t sse_lo_s32 = vdupq_n_s32(0); - int32x4_t sse_hi_s32 = vdupq_n_s32(0); - - // The loop loads 16 values at a time but doubles them up when accumulating - // into sum_s16. - assert(w / 8 * h <= 128); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 16) { - const uint8x16_t a_u8 = vld1q_u8(src_ptr + j); - const uint8x16_t b_u8 = vld1q_u8(ref_ptr + j); - - const uint16x8_t diff_lo_u16 = - vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)); - const uint16x8_t diff_hi_u16 = - vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)); - - const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16); - const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16); - - sum_s16 = vaddq_s16(sum_s16, diff_lo_s16); - sum_s16 = vaddq_s16(sum_s16, diff_hi_s16); - - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16), - vget_low_s16(diff_lo_s16)); - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16), - vget_high_s16(diff_lo_s16)); - - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16), - vget_low_s16(diff_hi_s16)); - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16), - vget_high_s16(diff_hi_s16)); - } + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int i = h; + + // Number of rows we can process before 'sum_s16' overflows: + // 32767 / 255 ~= 128 + assert(h <= 128); + + do { + const uint8x8_t s = vld1_u8(src_ptr); + const uint8x8_t r = vld1_u8(ref_ptr); + const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r)); + + sum_s16 = vaddq_s16(sum_s16, diff); + + sse_s32[0] = vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff)); + src_ptr += src_stride; ref_ptr += ref_stride; - } + } while (--i != 0); *sum = horizontal_add_int16x8(sum_s16); - *sse = horizontal_add_uint32x4( - vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); + *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1])); } -// Process a block of width 8 two rows at a time. -static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int h, - uint32_t *sse, int *sum) { - int i = 0; - int16x8_t sum_s16 = vdupq_n_s16(0); - int32x4_t sse_lo_s32 = vdupq_n_s32(0); - int32x4_t sse_hi_s32 = vdupq_n_s32(0); +// Process a block of width 16 one row at a time. +static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { + int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) }; + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int i = h; - // Each column has it's own accumulator entry in sum_s16. + // Number of rows we can process before 'sum_s16' accumulators overflow: + // 32767 / 255 ~= 128, so 128 16-wide rows. 
assert(h <= 128); do { - const uint8x8_t a_0_u8 = vld1_u8(src_ptr); - const uint8x8_t a_1_u8 = vld1_u8(src_ptr + src_stride); - const uint8x8_t b_0_u8 = vld1_u8(ref_ptr); - const uint8x8_t b_1_u8 = vld1_u8(ref_ptr + ref_stride); - const uint16x8_t diff_0_u16 = vsubl_u8(a_0_u8, b_0_u8); - const uint16x8_t diff_1_u16 = vsubl_u8(a_1_u8, b_1_u8); - const int16x8_t diff_0_s16 = vreinterpretq_s16_u16(diff_0_u16); - const int16x8_t diff_1_s16 = vreinterpretq_s16_u16(diff_1_u16); - sum_s16 = vaddq_s16(sum_s16, diff_0_s16); - sum_s16 = vaddq_s16(sum_s16, diff_1_s16); - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_0_s16), - vget_low_s16(diff_0_s16)); - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_1_s16), - vget_low_s16(diff_1_s16)); - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_0_s16), - vget_high_s16(diff_0_s16)); - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_1_s16), - vget_high_s16(diff_1_s16)); - src_ptr += src_stride + src_stride; - ref_ptr += ref_stride + ref_stride; - i += 2; + const uint8x16_t s = vld1q_u8(src_ptr); + const uint8x16_t r = vld1q_u8(ref_ptr); + + const int16x8_t diff_l = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r))); + const int16x8_t diff_h = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r))); + + sum_s16[0] = vaddq_s16(sum_s16[0], diff_l); + sum_s16[1] = vaddq_s16(sum_s16[1], diff_h); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l)); + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int16x8(vaddq_s16(sum_s16[0], sum_s16[1])); + *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1])); +} + +// Process a block of any size where the width is divisible by 16. +static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int w, int h, int h_limit, + unsigned int *sse, int *sum) { + int32x4_t sum_s32 = vdupq_n_s32(0); + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + + // 'h_limit' is the number of 'w'-width rows we can process before our 16-bit + // accumulator overflows. After hitting this limit we accumulate into 32-bit + // elements. + int h_tmp = h > h_limit ? 
h_limit : h; + + int i = 0; + do { + int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) }; + do { + int j = 0; + do { + const uint8x16_t s = vld1q_u8(src_ptr + j); + const uint8x16_t r = vld1q_u8(ref_ptr + j); + + const int16x8_t diff_l = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r))); + const int16x8_t diff_h = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r))); + + sum_s16[0] = vaddq_s16(sum_s16[0], diff_l); + sum_s16[1] = vaddq_s16(sum_s16[1], diff_h); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l)); + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h)); + + j += 16; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + i++; + } while (i < h_tmp); + + sum_s32 = vpadalq_s16(sum_s32, sum_s16[0]); + sum_s32 = vpadalq_s16(sum_s32, sum_s16[1]); + + h_tmp += h_limit; } while (i < h); - *sum = horizontal_add_int16x8(sum_s16); - *sse = horizontal_add_uint32x4( - vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); + *sum = horizontal_add_int32x4(sum_s32); + *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1])); } -#endif +static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon(src, src_stride, ref, ref_stride, 32, h, 64, sse, sum); +} + +static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon(src, src_stride, ref, ref_stride, 64, h, 32, sse, sum); +} + +#endif // defined(__ARM_FEATURE_DOTPROD) void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum) { - variance_neon_w8x2(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, sum); + variance_8xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, sum); } void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum) { - variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16, sse, sum); + variance_16xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, sse, sum); } -#define VARIANCENXM(n, m, shift) \ - unsigned int vpx_variance##n##x##m##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, unsigned int *sse) { \ - int sum; \ - if (n == 4) \ - variance_neon_w4x4(src_ptr, src_stride, ref_ptr, ref_stride, m, sse, \ - &sum); \ - else if (n == 8) \ - variance_neon_w8x2(src_ptr, src_stride, ref_ptr, ref_stride, m, sse, \ - &sum); \ - else \ - variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, n, m, sse, \ - &sum); \ - if (n * m < 16 * 16) \ - return *sse - ((sum * sum) >> shift); \ - else \ - return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ +#define VARIANCE_WXH_NEON(w, h, shift) \ + unsigned int vpx_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ } -VARIANCENXM(4, 4, 4) -VARIANCENXM(4, 8, 5) -VARIANCENXM(8, 4, 5) -VARIANCENXM(8, 8, 6) -VARIANCENXM(8, 16, 7) 
-VARIANCENXM(16, 8, 7) -VARIANCENXM(16, 16, 8) -VARIANCENXM(16, 32, 9) -VARIANCENXM(32, 16, 9) -VARIANCENXM(32, 32, 10) - -unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32, &sse1, - &sum1); - variance_neon_w16(src_ptr + (32 * src_stride), src_stride, - ref_ptr + (32 * ref_stride), ref_stride, 32, 32, &sse2, - &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); -} +VARIANCE_WXH_NEON(4, 4, 4) +VARIANCE_WXH_NEON(4, 8, 5) -unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 64, 16, &sse1, - &sum1); - variance_neon_w16(src_ptr + (16 * src_stride), src_stride, - ref_ptr + (16 * ref_stride), ref_stride, 64, 16, &sse2, - &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); -} +VARIANCE_WXH_NEON(8, 4, 5) +VARIANCE_WXH_NEON(8, 8, 6) +VARIANCE_WXH_NEON(8, 16, 7) -unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - - variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 64, 16, &sse1, - &sum1); - variance_neon_w16(src_ptr + (16 * src_stride), src_stride, - ref_ptr + (16 * ref_stride), ref_stride, 64, 16, &sse2, - &sum2); - sse1 += sse2; - sum1 += sum2; - - variance_neon_w16(src_ptr + (16 * 2 * src_stride), src_stride, - ref_ptr + (16 * 2 * ref_stride), ref_stride, 64, 16, &sse2, - &sum2); - sse1 += sse2; - sum1 += sum2; - - variance_neon_w16(src_ptr + (16 * 3 * src_stride), src_stride, - ref_ptr + (16 * 3 * ref_stride), ref_stride, 64, 16, &sse2, - &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12); -} +VARIANCE_WXH_NEON(16, 8, 7) +VARIANCE_WXH_NEON(16, 16, 8) +VARIANCE_WXH_NEON(16, 32, 9) + +VARIANCE_WXH_NEON(32, 16, 9) +VARIANCE_WXH_NEON(32, 32, 10) +VARIANCE_WXH_NEON(32, 64, 11) + +VARIANCE_WXH_NEON(64, 32, 11) +VARIANCE_WXH_NEON(64, 64, 12) -#if defined(__ARM_FEATURE_DOTPROD) && (__ARM_FEATURE_DOTPROD == 1) +#if defined(__ARM_FEATURE_DOTPROD) unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, @@ -421,7 +433,7 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, return vget_lane_u32(sse, 0); } -#else +#else // !defined(__ARM_FEATURE_DOTPROD) unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, @@ -518,4 +530,4 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse)); } -#endif +#endif // defined(__ARM_FEATURE_DOTPROD) diff --git a/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c index 06b58c438..b4cdd58c7 100644 --- a/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c @@ -31,8 +31,9 @@ // instructions. This optimization is much faster in speed unit test, but slowed // down the whole decoder by 5%. 
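The guard below now admits two dot-product flavours: SDOT, which needs the range clamp to [-128, 127] plus a precomputed correction used further down, and I8MM's USDOT, which consumes unsigned samples directly. The convolve8_4_usdot helper the new functions call lives in a shared header rather than in this hunk; a plausible sketch of its core, assuming the dot_prod_permute_tbl layout declared just below and an I8MM-capable toolchain:

#include <arm_neon.h>

/* Sketch (assumed shape, not the patch's exact helper): 8-tap filtering of
 * four adjacent output pixels. 'samples' holds src[0..10] in its low
 * bytes; 'permute_tbl' is the first two dot_prod_permute_tbl rows, giving
 * sliding 4-byte windows; 'filters' holds the eight taps narrowed to int8.
 * Requires __ARM_FEATURE_MATMUL_INT8. */
static inline int32x4_t convolve8_4_usdot_sketch(uint8x16_t samples,
                                                 int8x8_t filters,
                                                 uint8x16x2_t permute_tbl) {
  /* Windows [x+0 .. x+3] for outputs x = 0..3, then windows [x+4 .. x+7]. */
  const uint8x16_t perm_lo = vqtbl1q_u8(samples, permute_tbl.val[0]);
  const uint8x16_t perm_hi = vqtbl1q_u8(samples, permute_tbl.val[1]);

  /* Each 32-bit lane accumulates one 4-element u8 x s8 dot product. */
  int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), perm_lo, filters, 0);
  sum = vusdotq_lane_s32(sum, perm_hi, filters, 1);
  return sum; /* Callers narrow and round: vqrshrun_n_s16(..., 7). */
}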
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && \ + (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)) + DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, @@ -53,9 +54,176 @@ DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; -static INLINE void transpose_concat_4x4(int8x8_t *a0, int8x8_t *a1, - int8x8_t *a2, int8x8_t *a3, - int8x16_t *b, +#if defined(__ARM_FEATURE_MATMUL_INT8) + +void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + uint8x16_t s0, s1, s2, s3; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int32x4_t t0, t1, t2, t3; + int16x8_t t01, t23; + uint8x8_t d01, d23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_usdot(s0, filters, permute_tbl); + t1 = convolve8_4_usdot(s1, filters, permute_tbl); + t2 = convolve8_4_usdot(s2, filters, permute_tbl); + t3 = convolve8_4_usdot(s3, filters, permute_tbl); + t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); + t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); + d01 = vqrshrun_n_s16(t01, 7); + d23 = vqrshrun_n_s16(t23, 7); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_usdot(s0, filters, permute_tbl); + d1 = convolve8_8_usdot(s1, filters, permute_tbl); + d2 = convolve8_8_usdot(s2, filters, permute_tbl); + d3 = convolve8_8_usdot(s3, filters, permute_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } +} + +void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + uint8x16_t s0, s1, s2, s3; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int32x4_t t0, t1, t2, t3; + int16x8_t t01, t23; + uint8x8_t d01, d23, dd01, dd23; + dd01 = vdup_n_u8(0); + dd23 = vdup_n_u8(0); + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_usdot(s0, filters, permute_tbl); + t1 = convolve8_4_usdot(s1, filters, permute_tbl); + t2 = convolve8_4_usdot(s2, filters, permute_tbl); + t3 = convolve8_4_usdot(s3, filters, 
permute_tbl); + t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); + t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); + d01 = vqrshrun_n_s16(t01, 7); + d23 = vqrshrun_n_s16(t23, 7); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_usdot(s0, filters, permute_tbl); + d1 = convolve8_8_usdot(s1, filters, permute_tbl); + d2 = convolve8_8_usdot(s2, filters, permute_tbl); + d3 = convolve8_8_usdot(s3, filters, permute_tbl); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } +} + +static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b, const uint8x16_t permute_tbl) { /* Transpose 8-bit elements and concatenate result rows as follows: * a0: 00, 01, 02, 03, XX, XX, XX, XX @@ -70,13 +238,13 @@ static INLINE void transpose_concat_4x4(int8x8_t *a0, int8x8_t *a1, * inline helper is called many times from the same parent function. */ - int8x16x2_t samples = { { vcombine_s8(*a0, *a1), vcombine_s8(*a2, *a3) } }; - *b = vqtbl2q_s8(samples, permute_tbl); + uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; + *b = vqtbl2q_u8(samples, permute_tbl); } -static INLINE void transpose_concat_8x4(int8x8_t *a0, int8x8_t *a1, - int8x8_t *a2, int8x8_t *a3, - int8x16_t *b0, int8x16_t *b1, +static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b0, uint8x16_t *b1, const uint8x16x2_t permute_tbl) { /* Transpose 8-bit elements and concatenate result rows as follows: * a0: 00, 01, 02, 03, 04, 05, 06, 07 @@ -92,11 +260,364 @@ static INLINE void transpose_concat_8x4(int8x8_t *a0, int8x8_t *a1, * inline helper is called many times from the same parent function. 
*/ - int8x16x2_t samples = { { vcombine_s8(*a0, *a1), vcombine_s8(*a2, *a3) } }; - *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]); - *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); + uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; + *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]); + *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]); +} + +void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint8x16x2_t samples_LUT; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int32x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + + do { + load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_usdot_partial(s0123, s4567, filters); + d1 = convolve8_4_usdot_partial(s1234, s5678, filters); + d2 = convolve8_4_usdot_partial(s2345, s6789, filters); + d3 = convolve8_4_usdot_partial(s3456, s78910, filters); + d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); + d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
*/ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + filters); + d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + filters); + d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + filters); + d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + filters); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
*/ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } } +void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint8x16x2_t samples_LUT; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int32x4_t d0, d1, d2, d3; + uint8x8_t d01, d23, dd01, dd23; + + load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + + do { + load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_usdot_partial(s0123, s4567, filters); + d1 = convolve8_4_usdot_partial(s1234, s5678, filters); + d2 = convolve8_4_usdot_partial(s2345, s6789, filters); + d3 = convolve8_4_usdot_partial(s3456, s78910, filters); + d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); + d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
*/ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + filters); + d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + filters); + d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + filters); + d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + filters); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
*/ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } +} + +#else // !defined(__ARM_FEATURE_MATMUL_INT8) + void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -125,33 +646,22 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, int16x8_t t01, t23; uint8x8_t d01, d23; - s0 = vld1q_u8(src); - src += src_stride; - s1 = vld1q_u8(src); - src += src_stride; - s2 = vld1q_u8(src); - src += src_stride; - s3 = vld1q_u8(src); - src += src_stride; - - t0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl); - t1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl); - t2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl); - t3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl); + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl); + t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl); + t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl); + t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl); t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); d01 = vqrshrun_n_s16(t01, 7); d23 = vqrshrun_n_s16(t23, 7); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); - dst += dst_stride; + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; h -= 4; } while (h > 0); } else { @@ -166,20 +676,18 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s = src; d = dst; do { - s0 = vld1q_u8(s + 0 * src_stride); - s1 = vld1q_u8(s + 1 * src_stride); - s2 = vld1q_u8(s + 2 * src_stride); - s3 = vld1q_u8(s + 3 * src_stride); + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl); - d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl); - d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl); - d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl); + d0 = + convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl); + d1 = + convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl); + d2 = + convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl); + d3 = + convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl); - vst1_u8(d + 0 * dst_stride, d0); - vst1_u8(d + 1 * dst_stride, d1); - vst1_u8(d + 2 * dst_stride, d2); - vst1_u8(d + 3 * dst_stride, d3); + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; @@ -222,20 +730,12 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, dd01 = vdup_n_u8(0); dd23 = vdup_n_u8(0); - s0 = vld1q_u8(src); - src += src_stride; - s1 = vld1q_u8(src); - src += src_stride; - s2 = 
vld1q_u8(src); - src += src_stride; - s3 = vld1q_u8(src); - src += src_stride; - - t0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl); - t1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl); - t2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl); - t3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl); + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl); + t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl); + t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl); + t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl); t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); d01 = vqrshrun_n_s16(t01, 7); @@ -243,17 +743,15 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, dd01 = load_u8(dst + 0 * dst_stride, dst_stride); dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + d01 = vrhadd_u8(d01, dd01); d23 = vrhadd_u8(d23, dd23); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); - dst += dst_stride; + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; h -= 4; } while (h > 0); } else { @@ -268,29 +766,25 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s = src; d = dst; do { - s0 = vld1q_u8(s + 0 * src_stride); - s1 = vld1q_u8(s + 1 * src_stride); - s2 = vld1q_u8(s + 2 * src_stride); - s3 = vld1q_u8(s + 3 * src_stride); - - d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl); - d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl); - d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl); - d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl); - - dd0 = vld1_u8(d + 0 * dst_stride); - dd1 = vld1_u8(d + 1 * dst_stride); - dd2 = vld1_u8(d + 2 * dst_stride); - dd3 = vld1_u8(d + 3 * dst_stride); + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = + convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl); + d1 = + convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl); + d2 = + convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl); + d3 = + convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + d0 = vrhadd_u8(d0, dd0); d1 = vrhadd_u8(d1, dd1); d2 = vrhadd_u8(d2, dd2); d3 = vrhadd_u8(d3, dd3); - vst1_u8(d + 0 * dst_stride, d0); - vst1_u8(d + 1 * dst_stride, d1); - vst1_u8(d + 2 * dst_stride, d2); - vst1_u8(d + 3 * dst_stride, d3); + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; @@ -303,6 +797,49 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } +static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, + int8x8_t a3, int8x16_t *b, + const uint8x16_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, XX, XX, XX, XX + * a1: 10, 11, 12, 13, XX, XX, XX, XX + * a2: 20, 21, 22, 23, XX, XX, XX, XX + * a3: 30, 31, 32, 33, 
XX, XX, XX, XX + * + * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; + *b = vqtbl2q_s8(samples, permute_tbl); +} + +static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, + int8x8_t a3, int8x16_t *b0, + int8x16_t *b1, + const uint8x16x2_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, 04, 05, 06, 07 + * a1: 10, 11, 12, 13, 14, 15, 16, 17 + * a2: 20, 21, 22, 23, 24, 25, 26, 27 + * a3: 30, 31, 32, 33, 34, 35, 36, 37 + * + * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; + *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]); + *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); +} + void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -333,14 +870,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, int32x4_t d0, d1, d2, d3; uint8x8_t d01, d23; - load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); - src += 4 * src_stride; - t4 = vld1_u8(src); - src += src_stride; - t5 = vld1_u8(src); - src += src_stride; - t6 = vld1_u8(src); - src += src_stride; + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + src += 7 * src_stride; /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); @@ -357,13 +888,13 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. 
*/ - transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl); + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); do { uint8x8_t t7, t8, t9, t10; @@ -375,7 +906,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl); + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); /* Merge new data into block from previous iteration. */ samples_LUT.val[0] = s3456; @@ -384,22 +915,15 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters); - d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters); - d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters); - d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters); - + d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); + d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); + d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); + d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); - dst += dst_stride; + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); /* Prepare block for next iteration - re-using as much as possible. */ /* Shuffle everything up four rows. */ @@ -409,6 +933,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s3456 = s78910; src += 4 * src_stride; + dst += 4 * dst_stride; h -= 4; } while (h > 0); } else { @@ -426,14 +951,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s = src; d = dst; - load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); - s += 4 * src_stride; - t4 = vld1_u8(s); - s += src_stride; - t5 = vld1_u8(s); - s += src_stride; - t6 = vld1_u8(s); - s += src_stride; + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s += 7 * src_stride; /* Clamp sample range to [-128, 127] for 8-bit signed dot product. 
*/ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); @@ -450,19 +969,19 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. */ - transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi, + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, tran_concat_tbl); - transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi, + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, tran_concat_tbl); - transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi, + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, tran_concat_tbl); - transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi, + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, tran_concat_tbl); - transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi, + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, tran_concat_tbl); - transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi, + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, tran_concat_tbl); - transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi, + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, tran_concat_tbl); do { @@ -475,7 +994,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi, + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, tran_concat_tbl); /* Merge new data into block from previous iteration. */ @@ -491,18 +1010,16 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - correction, filters); - d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - correction, filters); - d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - correction, filters); - d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - correction, filters); - vst1_u8(d + 0 * dst_stride, d0); - vst1_u8(d + 1 * dst_stride, d1); - vst1_u8(d + 2 * dst_stride, d2); - vst1_u8(d + 3 * dst_stride, d3); + d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + correction, filters); + d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + correction, filters); + d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + correction, filters); + d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + correction, filters); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); /* Prepare block for next iteration - re-using as much as possible. */ /* Shuffle everything up four rows. */ @@ -556,14 +1073,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, int32x4_t d0, d1, d2, d3; uint8x8_t d01, d23, dd01, dd23; - load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); - src += 4 * src_stride; - t4 = vld1_u8(src); - src += src_stride; - t5 = vld1_u8(src); - src += src_stride; - t6 = vld1_u8(src); - src += src_stride; + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + src += 7 * src_stride; /* Clamp sample range to [-128, 127] for 8-bit signed dot product. 
*/ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); @@ -580,13 +1091,13 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. */ - transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl); + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); do { uint8x8_t t7, t8, t9, t10; @@ -598,7 +1109,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl); + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); /* Merge new data into block from previous iteration. */ samples_LUT.val[0] = s3456; @@ -607,27 +1118,21 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters); - d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters); - d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters); - d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters); - + d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); + d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); + d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); + d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); dd01 = load_u8(dst + 0 * dst_stride, dst_stride); dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + d01 = vrhadd_u8(d01, dd01); d23 = vrhadd_u8(d23, dd23); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); - dst += dst_stride; + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); /* Prepare block for next iteration - re-using as much as possible. */ /* Shuffle everything up four rows. 
*/ @@ -637,6 +1142,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s3456 = s78910; src += 4 * src_stride; + dst += 4 * dst_stride; h -= 4; } while (h > 0); } else { @@ -654,14 +1160,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s = src; d = dst; - load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); - s += 4 * src_stride; - t4 = vld1_u8(s); - s += src_stride; - t5 = vld1_u8(s); - s += src_stride; - t6 = vld1_u8(s); - s += src_stride; + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s += 7 * src_stride; /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); @@ -678,19 +1178,19 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. */ - transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi, + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, tran_concat_tbl); - transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi, + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, tran_concat_tbl); - transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi, + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, tran_concat_tbl); - transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi, + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, tran_concat_tbl); - transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi, + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, tran_concat_tbl); - transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi, + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, tran_concat_tbl); - transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi, + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, tran_concat_tbl); do { @@ -703,7 +1203,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi, + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, tran_concat_tbl); /* Merge new data into block from previous iteration. 
*/ @@ -719,28 +1219,23 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - correction, filters); - d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - correction, filters); - d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - correction, filters); - d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - correction, filters); - - dd0 = vld1_u8(d + 0 * dst_stride); - dd1 = vld1_u8(d + 1 * dst_stride); - dd2 = vld1_u8(d + 2 * dst_stride); - dd3 = vld1_u8(d + 3 * dst_stride); + d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + correction, filters); + d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + correction, filters); + d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + correction, filters); + d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + correction, filters); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + d0 = vrhadd_u8(d0, dd0); d1 = vrhadd_u8(d1, dd1); d2 = vrhadd_u8(d2, dd2); d3 = vrhadd_u8(d3, dd3); - vst1_u8(d + 0 * dst_stride, d0); - vst1_u8(d + 1 * dst_stride, d1); - vst1_u8(d + 2 * dst_stride, d2); - vst1_u8(d + 3 * dst_stride, d3); + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); /* Prepare block for next iteration - re-using as much as possible. */ /* Shuffle everything up four rows. */ @@ -764,29 +1259,11 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } } -#else - -static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p, - const uint8x8_t s0, const uint8x8_t s1, - const uint8x8_t s2, const uint8x8_t s3, - const uint8x8_t s4, const uint8x8_t s5, - const uint8x8_t s6, const uint8x8_t s7) { - vst1_u8(s, s0); - s += p; - vst1_u8(s, s1); - s += p; - vst1_u8(s, s2); - s += p; - vst1_u8(s, s3); - s += p; - vst1_u8(s, s4); - s += p; - vst1_u8(s, s5); - s += p; - vst1_u8(s, s6); - s += p; - vst1_u8(s, s7); -} +#endif // defined(__ARM_FEATURE_MATMUL_INT8) + +#else // !(defined(__aarch64__) && + // (defined(__ARM_FEATURE_DOTPROD) || + // defined(__ARM_FEATURE_MATMUL_INT8))) void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, @@ -808,16 +1285,13 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, if (h == 4) { uint8x8_t d01, d23; - int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, - d1, d2, d3; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; int16x8_t tt0, tt1, tt2, tt3; __builtin_prefetch(src + 0 * src_stride); __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); - filter3 = vdup_lane_s16(vget_low_s16(filters), 3); - filter4 = vdup_lane_s16(vget_high_s16(filters), 0); load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); @@ -849,14 +1323,10 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s9 = vget_low_s16(tt2); s10 = vget_low_s16(tt3); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - d3 = 
convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); @@ -883,8 +1353,6 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, w -= 4; } while (w != 0); } else { - const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); - const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); int width; const uint8_t *s; uint8x8_t t4, t5, t6, t7; @@ -927,14 +1395,10 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); transpose_u8_8x4(&t0, &t1, &t2, &t3); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0); @@ -1002,22 +1466,14 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); - t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3, - filter4); - t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3, - filter4); - t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3, - filter4); - t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, - filter3, filter4); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); + t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); + t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); + t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7); @@ -1061,8 +1517,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, if (h == 4) { uint8x8_t d01, d23; - int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, - d1, d2, d3; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; int16x8_t tt0, tt1, tt2, tt3; uint32x4_t d0123 = vdupq_n_u32(0); @@ -1070,8 +1525,6 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, 
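/* The convolve8_4()/convolve8_8() call sites in this hunk drop the
 * pre-broadcast filter3/filter4 vectors: the helpers now multiply by lane
 * (vmul_lane_s16/vmulq_lane_s16) directly from the filter register. A
 * minimal scalar sketch of the same 8-tap filter, assuming libvpx's
 * FILTER_BITS == 7 scaling; the function name is illustrative only: */
static int16_t convolve8_scalar_sketch(const int16_t s[8],
                                       const int16_t f[8]) {
  /* The six outer taps are small, so their partial sum fits comfortably in
   * int16; the NEON code accumulates them with plain vmla. */
  int32_t sum = s[0] * f[0] + s[1] * f[1] + s[2] * f[2] + s[5] * f[5] +
                s[6] * f[6] + s[7] * f[7];
  /* The two large centre taps (lanes 3 and 4) are added last; the vector
   * code uses saturating adds (vqadd) here to bound intermediate overflow. */
  sum += s[3] * f[3];
  sum += s[4] * f[4];
  /* Round and shift by FILTER_BITS, as the callers do with
   * vqrshrun_n_s16(..., 7) (which additionally clamps to [0, 255]). */
  return (int16_t)((sum + 64) >> 7);
}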
__builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); - filter3 = vdup_lane_s16(vget_low_s16(filters), 3); - filter4 = vdup_lane_s16(vget_high_s16(filters), 0); load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); @@ -1103,14 +1556,10 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s9 = vget_low_s16(tt2); s10 = vget_low_s16(tt3); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); @@ -1140,8 +1589,6 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, w -= 4; } while (w != 0); } else { - const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); - const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); int width; const uint8_t *s; uint8x8_t t4, t5, t6, t7; @@ -1186,14 +1633,10 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); transpose_u8_8x4(&t0, &t1, &t2, &t3); @@ -1276,22 +1719,14 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); - t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3, - filter4); - t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3, - filter4); - t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3, - filter4); - t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, - filter3, filter4); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); + t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, 
filters); + t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); + t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); @@ -1349,8 +1784,6 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3 * src_stride; if (w == 4) { - const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); - const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); uint8x8_t d01, d23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; @@ -1387,14 +1820,10 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); @@ -1417,8 +1846,6 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h != 0); } else { - const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); - const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); int height; const uint8_t *s; uint8_t *d; @@ -1469,14 +1896,10 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); vst1_u8(d, t0); d += dst_stride; @@ -1521,8 +1944,6 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3 * src_stride; if (w == 4) { - const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); - const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); uint8x8_t d01, d23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; uint32x4_t d0123 = vdupq_n_u32(0); @@ -1560,14 +1981,10 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + d0 = convolve8_4(s0, s1, s2, 
s3, s4, s5, s6, s7, filters); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); @@ -1598,8 +2015,6 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h != 0); } else { - const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); - const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); int height; const uint8_t *s; uint8_t *d; @@ -1651,14 +2066,10 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); d01 = vcombine_u8(t0, t1); d23 = vcombine_u8(t2, t3); @@ -1694,4 +2105,6 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } } -#endif +#endif // #if defined(__aarch64__) && + // (defined(__ARM_FEATURE_DOTPROD) || + // defined(__ARM_FEATURE_MATMUL_INT8)) diff --git a/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h index 857b6d54e..ed7f18053 100644 --- a/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h @@ -16,69 +16,12 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" -static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p, - uint8x8_t *const s0, uint8x8_t *const s1, - uint8x8_t *const s2, uint8x8_t *const s3) { - *s0 = vld1_u8(s); - s += p; - *s1 = vld1_u8(s); - s += p; - *s2 = vld1_u8(s); - s += p; - *s3 = vld1_u8(s); -} - -static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p, - uint8x8_t *const s0, uint8x8_t *const s1, - uint8x8_t *const s2, uint8x8_t *const s3, - uint8x8_t *const s4, uint8x8_t *const s5, - uint8x8_t *const s6, uint8x8_t *const s7) { - *s0 = vld1_u8(s); - s += p; - *s1 = vld1_u8(s); - s += p; - *s2 = vld1_u8(s); - s += p; - *s3 = vld1_u8(s); - s += p; - *s4 = vld1_u8(s); - s += p; - *s5 = vld1_u8(s); - s += p; - *s6 = vld1_u8(s); - s += p; - *s7 = vld1_u8(s); -} +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) -static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p, - uint8x16_t *const s0, uint8x16_t *const s1, - uint8x16_t *const s2, uint8x16_t *const s3, - uint8x16_t *const s4, uint8x16_t *const s5, - uint8x16_t *const s6, uint8x16_t *const s7) { - *s0 = vld1q_u8(s); - s += p; - *s1 = vld1q_u8(s); - s += p; - *s2 = vld1q_u8(s); - s += p; - *s3 = vld1q_u8(s); - s += p; - *s4 = vld1q_u8(s); - s += p; - *s5 = vld1q_u8(s); - s += p; - *s6 = vld1q_u8(s); - s += p; - *s7 = vld1q_u8(s); -} - -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) - -static INLINE int32x4_t convolve8_4_dot_partial(const int8x16_t samples_lo, - const int8x16_t samples_hi, - const int32x4_t correction, - const int8x8_t 
filters) { +static INLINE int32x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, + const int8x16_t samples_hi, + const int32x4_t correction, + const int8x8_t filters) { /* Sample range-clamping and permutation are performed by the caller. */ int32x4_t sum; @@ -90,11 +33,11 @@ static INLINE int32x4_t convolve8_4_dot_partial(const int8x16_t samples_lo, return sum; } -static INLINE int32x4_t convolve8_4_dot(uint8x16_t samples, - const int8x8_t filters, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16x2_t permute_tbl) { +static INLINE int32x4_t convolve8_4_sdot(uint8x16_t samples, + const int8x8_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x2_t permute_tbl) { int8x16_t clamped_samples, permuted_samples[2]; int32x4_t sum; @@ -115,12 +58,12 @@ static INLINE int32x4_t convolve8_4_dot(uint8x16_t samples, return sum; } -static INLINE uint8x8_t convolve8_8_dot_partial(const int8x16_t samples0_lo, - const int8x16_t samples0_hi, - const int8x16_t samples1_lo, - const int8x16_t samples1_hi, - const int32x4_t correction, - const int8x8_t filters) { +static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo, + const int8x16_t samples0_hi, + const int8x16_t samples1_lo, + const int8x16_t samples1_hi, + const int32x4_t correction, + const int8x8_t filters) { /* Sample range-clamping and permutation are performed by the caller. */ int32x4_t sum0, sum1; int16x8_t sum; @@ -138,11 +81,11 @@ static INLINE uint8x8_t convolve8_8_dot_partial(const int8x16_t samples0_lo, return vqrshrun_n_s16(sum, 7); } -static INLINE uint8x8_t convolve8_8_dot(uint8x16_t samples, - const int8x8_t filters, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16x3_t permute_tbl) { +static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples, + const int8x8_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x3_t permute_tbl) { int8x16_t clamped_samples, permuted_samples[3]; int32x4_t sum0, sum1; int16x8_t sum; @@ -171,15 +114,98 @@ static INLINE uint8x8_t convolve8_8_dot(uint8x16_t samples, return vqrshrun_n_s16(sum, 7); } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) + +#if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8) + +static INLINE int32x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, + const uint8x16_t samples_hi, + const int8x8_t filters) { + /* Sample permutation is performed by the caller. */ + int32x4_t sum; + + sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0); + sum = vusdotq_lane_s32(sum, samples_hi, filters, 1); + + /* Narrowing and packing is performed by the caller. */ + return sum; +} + +static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + uint8x16_t permuted_samples[2]; + int32x4_t sum; + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + + /* Accumulate dot product into 'correction' to account for range clamp. */ + sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1); + + /* Narrowing and packing is performed by the caller. 
*/ + return sum; +} + +static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo, + const uint8x16_t samples0_hi, + const uint8x16_t samples1_lo, + const uint8x16_t samples1_hi, + const int8x8_t filters) { + /* Sample permutation is performed by the caller. */ + int32x4_t sum0, sum1; + int16x8_t sum; + + /* First 4 output values. */ + sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0); + sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1); + /* Second 4 output values. */ + sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0); + sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1); + + /* Narrow and re-pack. */ + sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); + return vqrshrun_n_s16(sum, 7); +} + +static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples, + const int8x8_t filters, + const uint8x16x3_t permute_tbl) { + uint8x16_t permuted_samples[3]; + int32x4_t sum0, sum1; + int16x8_t sum; + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */ + permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + + /* First 4 output values. */ + sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1); + /* Second 4 output values. */ + sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0); + sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1); + + /* Narrow and re-pack. */ + sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); + return vqrshrun_n_s16(sum, 7); +} + +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8) static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, - const int16x8_t filters, - const int16x4_t filter3, - const int16x4_t filter4) { + const int16x8_t filters) { const int16x4_t filters_lo = vget_low_s16(filters); const int16x4_t filters_hi = vget_high_s16(filters); int16x4_t sum; @@ -190,8 +216,8 @@ static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, sum = vmla_lane_s16(sum, s5, filters_hi, 1); sum = vmla_lane_s16(sum, s6, filters_hi, 2); sum = vmla_lane_s16(sum, s7, filters_hi, 3); - sum = vqadd_s16(sum, vmul_s16(s3, filter3)); - sum = vqadd_s16(sum, vmul_s16(s4, filter4)); + sum = vqadd_s16(sum, vmul_lane_s16(s3, filters_lo, 3)); + sum = vqadd_s16(sum, vmul_lane_s16(s4, filters_hi, 0)); return sum; } @@ -199,9 +225,7 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, - const int16x8_t filters, - const int16x8_t filter3, - const int16x8_t filter4) { + const int16x8_t filters) { const int16x4_t filters_lo = vget_low_s16(filters); const int16x4_t filters_hi = vget_high_s16(filters); int16x8_t sum; @@ -212,15 +236,13 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, sum = vmlaq_lane_s16(sum, s5, filters_hi, 1); sum = vmlaq_lane_s16(sum, s6, filters_hi, 2); sum = vmlaq_lane_s16(sum, s7, filters_hi, 3); - sum = vqaddq_s16(sum, vmulq_s16(s3, 
filter3)); - sum = vqaddq_s16(sum, vmulq_s16(s4, filter4)); + sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filters_lo, 3)); + sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filters_hi, 0)); return vqrshrun_n_s16(sum, 7); } static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, const int16x8_t filters) { - const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); - const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); int16x8_t ss[8]; ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0])); @@ -233,7 +255,7 @@ static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7])); return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7], - filters, filter3, filter4); + filters); } #endif // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_ diff --git a/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c b/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c index 8edf8a66e..b8e3c5e54 100644 --- a/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c +++ b/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c @@ -15,6 +15,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/arm/vpx_convolve8_neon.h" #include "vpx_ports/mem.h" @@ -38,8 +39,6 @@ static INLINE void scaledconvolve_horiz_w4( const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; if (x_q4 & SUBPEL_MASK) { const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); - const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); - const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); uint8x8_t s[8], d; int16x8_t ss[4]; int16x4_t t[8], tt; @@ -61,7 +60,7 @@ static INLINE void scaledconvolve_horiz_w4( t[7] = vget_high_s16(ss[3]); tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], - filters, filter3, filter4); + filters); d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0); } else { @@ -167,8 +166,6 @@ static INLINE void scaledconvolve_vert_w4( if (y_q4 & SUBPEL_MASK) { const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); - const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); - const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); uint8x8_t s[8], d; int16x4_t t[8], tt; @@ -183,8 +180,7 @@ static INLINE void scaledconvolve_vert_w4( t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6]))); t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7]))); - tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters, - filter3, filter4); + tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters); d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); } else { diff --git a/libvpx/vpx_dsp/avg.c b/libvpx/vpx_dsp/avg.c index 1c45e8a73..954015407 100644 --- a/libvpx/vpx_dsp/avg.c +++ b/libvpx/vpx_dsp/avg.c @@ -7,6 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ + +#include <assert.h> #include <stdlib.h> #include "./vpx_dsp_rtcd.h" @@ -344,6 +346,7 @@ void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height) { int idx; const int norm_factor = height >> 1; + assert(height >= 2); for (idx = 0; idx < 16; ++idx) { int i; hbuf[idx] = 0; diff --git a/libvpx/vpx_dsp/bitwriter.h b/libvpx/vpx_dsp/bitwriter.h index 04084af8f..5f1ee69ec 100644 --- a/libvpx/vpx_dsp/bitwriter.h +++ b/libvpx/vpx_dsp/bitwriter.h @@ -13,6 +13,7 @@ #include <stdio.h> +#include "vpx_ports/compiler_attributes.h" #include "vpx_ports/mem.h" #include "vpx_dsp/prob.h" @@ -35,7 +36,9 @@ typedef struct vpx_writer { void vpx_start_encode(vpx_writer *br, uint8_t *source); void vpx_stop_encode(vpx_writer *br); -static INLINE void vpx_write(vpx_writer *br, int bit, int probability) { +static INLINE VPX_NO_UNSIGNED_SHIFT_CHECK void vpx_write(vpx_writer *br, + int bit, + int probability) { unsigned int split; int count = br->count; unsigned int range = br->range; diff --git a/libvpx/vpx_dsp/loongarch/quantize_lsx.c b/libvpx/vpx_dsp/loongarch/quantize_lsx.c index 2fc33b06b..77be0bb4f 100644 --- a/libvpx/vpx_dsp/loongarch/quantize_lsx.c +++ b/libvpx/vpx_dsp/loongarch/quantize_lsx.c @@ -59,7 +59,6 @@ static INLINE void calculate_dqcoeff_and_store_32x32(__m128i qcoeff, } static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1, - __m128i zbin_mask0, __m128i zbin_mask1, const int16_t *scan, int index, __m128i zero) { const __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero); @@ -68,8 +67,6 @@ static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1, __m128i scan1 = __lsx_vld(scan + index + 8, 0); __m128i eob0, eob1; - scan0 = __lsx_vsub_h(scan0, zbin_mask0); - scan1 = __lsx_vsub_h(scan1, zbin_mask1); eob0 = __lsx_vandn_v(zero_coeff0, scan0); eob1 = __lsx_vandn_v(zero_coeff1, scan1); return __lsx_vmax_h(eob0, eob1); @@ -138,7 +135,7 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, dequant = __lsx_vilvh_d(dequant, dequant); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero); // AC only loop. while (index < n_coeffs) { coeff0 = __lsx_vld(coeff_ptr + index, 0); @@ -161,8 +158,7 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero); eob = __lsx_vmax_h(eob, eob0); index += 16; @@ -221,7 +217,7 @@ void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr); dequant = __lsx_vilvh_d(dequant, dequant); calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero); // AC only loop. 
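/* Context for the scan_for_eob() calls in the loop that follows: the
 * quantizer reports eob as one past the last nonzero coefficient in scan
 * order, and the SIMD kernels derive it from the inverse scan table 16
 * coefficients at a time. A plain-C sketch of that search (illustrative,
 * not the LSX kernel itself): */
static int eob_scan_sketch(const int16_t *qcoeff, const int16_t *iscan,
                           int n_coeffs) {
  int i, eob = 0;
  for (i = 0; i < n_coeffs; i++) {
    /* iscan[i] is the scan-order position of raster-order coefficient i. */
    if (qcoeff[i] != 0 && iscan[i] + 1 > eob) eob = iscan[i] + 1;
  }
  return eob;
}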
for (index = 16; index < 32 * 32; index += 16) { coeff0 = __lsx_vld(coeff_ptr + index, 0); @@ -243,8 +239,7 @@ void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr + index); calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8 + index); - eob0 = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero); eob = __lsx_vmax_h(eob, eob0); } diff --git a/libvpx/vpx_dsp/loopfilter.c b/libvpx/vpx_dsp/loopfilter.c index 995602831..d6504aab1 100644 --- a/libvpx/vpx_dsp/loopfilter.c +++ b/libvpx/vpx_dsp/loopfilter.c @@ -159,7 +159,7 @@ void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); } -static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, +static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat, uint8_t *op3, uint8_t *op2, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, uint8_t *oq3) { @@ -232,8 +232,8 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); } -static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat, - uint8_t flat2, uint8_t *op7, uint8_t *op6, +static INLINE void filter16(int8_t mask, uint8_t thresh, int8_t flat, + int8_t flat2, uint8_t *op7, uint8_t *op6, uint8_t *op5, uint8_t *op4, uint8_t *op3, uint8_t *op2, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, @@ -505,7 +505,7 @@ void vpx_highbd_lpf_vertical_4_dual_c( bd); } -static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat, +static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat, uint16_t *op3, uint16_t *op2, uint16_t *op1, uint16_t *op0, uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, uint16_t *oq3, int bd) { @@ -584,8 +584,8 @@ void vpx_highbd_lpf_vertical_8_dual_c( bd); } -static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat, - uint8_t flat2, uint16_t *op7, uint16_t *op6, +static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, int8_t flat, + int8_t flat2, uint16_t *op7, uint16_t *op6, uint16_t *op5, uint16_t *op4, uint16_t *op3, uint16_t *op2, uint16_t *op1, uint16_t *op0, uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, diff --git a/libvpx/vpx_dsp/mips/macros_msa.h b/libvpx/vpx_dsp/mips/macros_msa.h index 3c2f50c79..d54ce5368 100644 --- a/libvpx/vpx_dsp/mips/macros_msa.h +++ b/libvpx/vpx_dsp/mips/macros_msa.h @@ -83,31 +83,33 @@ val_lh_m; \ }) -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \ - uint32_t val_lw_m; \ - \ - __asm__ __volatile__("lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \ - "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \ - : [val_lw_m] "=&r"(val_lw_m) \ - : [psrc_lw_m] "r"(psrc_lw_m)); \ - \ - val_lw_m; \ +#define LW(psrc) \ + ({ \ + const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \ + uint32_t val_lw_m; \ + \ + __asm__ __volatile__( \ + "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \ + "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \ + : [val_lw_m] "=&r"(val_lw_m) \ + : [psrc_lw_m] "r"(psrc_lw_m)); \ + \ + val_lw_m; \ }) #if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ - uint64_t val_ld_m = 0; \ - \ - __asm__ __volatile__("ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \ - "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \ - : [val_ld_m] 
"=&r"(val_ld_m) \ - : [psrc_ld_m] "r"(psrc_ld_m)); \ - \ - val_ld_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ + uint64_t val_ld_m = 0; \ + \ + __asm__ __volatile__( \ + "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \ + "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \ + : [val_ld_m] "=&r"(val_ld_m) \ + : [psrc_ld_m] "r"(psrc_ld_m)); \ + \ + val_ld_m; \ }) #else // !(__mips == 64) #define LD(psrc) \ diff --git a/libvpx/vpx_dsp/ppc/quantize_vsx.c b/libvpx/vpx_dsp/ppc/quantize_vsx.c index 7cdcbeb40..ab71f6e23 100644 --- a/libvpx/vpx_dsp/ppc/quantize_vsx.c +++ b/libvpx/vpx_dsp/ppc/quantize_vsx.c @@ -78,11 +78,10 @@ static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff, return (int16x8_t)vec_perm(dqcoeffe, dqcoeffo, vec_perm_odd_even_pack); } -static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, bool16x8_t mask, +static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, const int16_t *iscan_ptr, int index) { int16x8_t scan = vec_vsx_ld(index, iscan_ptr); bool16x8_t zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16); - scan = vec_sub(scan, mask); return vec_andc(scan, zero_coeff); } @@ -139,8 +138,8 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16); vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr); - eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0), - nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16)); + eob = vec_max(nonzero_scanindex(qcoeff0, iscan_ptr, 0), + nonzero_scanindex(qcoeff1, iscan_ptr, 16)); if (n_coeffs > 16) { int index = 16; @@ -177,10 +176,9 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr); vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr); - eob = - vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0)); - eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1), - nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2)); + eob = vec_max(eob, nonzero_scanindex(qcoeff0, iscan_ptr, off0)); + eob2 = vec_max(nonzero_scanindex(qcoeff1, iscan_ptr, off1), + nonzero_scanindex(qcoeff2, iscan_ptr, off2)); eob = vec_max(eob, eob2); index += 24; @@ -252,8 +250,8 @@ void vpx_quantize_b_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = vec_splat(dequant, 1); // remove DC from dequant vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), 16, dqcoeff_ptr); - eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0), - nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16)); + eob = vec_max(nonzero_scanindex(qcoeff0, iscan_ptr, 0), + nonzero_scanindex(qcoeff1, iscan_ptr, 16)); do { int16x8_t coeff2, coeff2_abs, qcoeff2, eob2; @@ -286,9 +284,9 @@ void vpx_quantize_b_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), off1, dqcoeff_ptr); vec_vsx_st(dequantize_coeff_32(qcoeff2, dequant), off2, dqcoeff_ptr); - eob = vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0)); - eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1), - nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2)); + eob = vec_max(eob, nonzero_scanindex(qcoeff0, iscan_ptr, off0)); + eob2 = vec_max(nonzero_scanindex(qcoeff1, iscan_ptr, off1), + nonzero_scanindex(qcoeff2, iscan_ptr, off2)); eob = vec_max(eob, eob2); // 24 int16_t is 48 bytes diff --git a/libvpx/vpx_dsp/psnr.c b/libvpx/vpx_dsp/psnr.c index 48bac0450..f0d4e927a 100644 --- a/libvpx/vpx_dsp/psnr.c +++ b/libvpx/vpx_dsp/psnr.c @@ -26,57 +26,44 @@ 
double vpx_sse_to_psnr(double samples, double peak, double sse) { /* TODO(yaowu): The block_variance calls the unoptimized versions of variance() * and highbd_8_variance(). It should not. */ -static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h, unsigned int *sse, - int *sum) { +static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h) { int i, j; - - *sum = 0; - *sse = 0; + int64_t sse = 0; for (i = 0; i < h; i++) { for (j = 0; j < w; j++) { const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; + sse += diff * diff; } a += a_stride; b += b_stride; } + + return sse; } #if CONFIG_VP9_HIGHBITDEPTH -static void encoder_highbd_variance64(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, - int h, uint64_t *sse, int64_t *sum) { +static int64_t encoder_highbd_8_sse(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, + int h) { int i, j; + int64_t sse = 0; uint16_t *a = CONVERT_TO_SHORTPTR(a8); uint16_t *b = CONVERT_TO_SHORTPTR(b8); - *sum = 0; - *sse = 0; for (i = 0; i < h; i++) { for (j = 0; j < w; j++) { const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; + sse += diff * diff; } a += a_stride; b += b_stride; } -} -static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, - int h, unsigned int *sse, int *sum) { - uint64_t sse_long = 0; - int64_t sum_long = 0; - encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, - &sum_long); - *sse = (unsigned int)sse_long; - *sum = (int)sum_long; + return sse; } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -85,26 +72,23 @@ static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b, const int dw = width % 16; const int dh = height % 16; int64_t total_sse = 0; - unsigned int sse = 0; - int sum = 0; int x, y; if (dw > 0) { - encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, dw, - height, &sse, &sum); - total_sse += sse; + total_sse += encoder_sse(&a[width - dw], a_stride, &b[width - dw], b_stride, + dw, height); } if (dh > 0) { - encoder_variance(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, width - dw, dh, - &sse, &sum); - total_sse += sse; + total_sse += + encoder_sse(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, width - dw, dh); } for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; + unsigned int sse; for (x = 0; x < width / 16; ++x) { vpx_mse16x16(pa, a_stride, pb, b_stride, &sse); total_sse += sse; @@ -146,22 +130,19 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b, int x, y; const int dw = width % 16; const int dh = height % 16; - unsigned int sse = 0; - int sum = 0; if (dw > 0) { - encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw], - b_stride, dw, height, &sse, &sum); - total_sse += sse; + total_sse += encoder_highbd_8_sse(&a[width - dw], a_stride, &b[width - dw], + b_stride, dw, height); } if (dh > 0) { - encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, - width - dw, dh, &sse, &sum); - total_sse += sse; + total_sse += encoder_highbd_8_sse(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, + width - dw, dh); } for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; + unsigned int sse; for (x = 0; x < width / 16; ++x) 
{ vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse); total_sse += sse; diff --git a/libvpx/vpx_dsp/variance.c b/libvpx/vpx_dsp/variance.c index 30b55dcb4..ce1e8382b 100644 --- a/libvpx/vpx_dsp/variance.c +++ b/libvpx/vpx_dsp/variance.c @@ -549,9 +549,9 @@ HIGHBD_MSE(16, 8) HIGHBD_MSE(8, 16) HIGHBD_MSE(8, 8) -void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint16_t *pred, - int width, int height, const uint16_t *ref, - int ref_stride) { +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred, + int width, int height, const uint16_t *ref, + int ref_stride) { int i, j; for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { diff --git a/libvpx/vpx_dsp/vpx_dsp.mk b/libvpx/vpx_dsp/vpx_dsp.mk index 13999af04..1fd9495cf 100644 --- a/libvpx/vpx_dsp/vpx_dsp.mk +++ b/libvpx/vpx_dsp/vpx_dsp.mk @@ -226,19 +226,19 @@ DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32x32_impl_sse2.h ifeq ($(VPX_ARCH_X86_64),yes) DSP_SRCS-$(HAVE_SSSE3) += x86/fwd_txfm_ssse3_x86_64.asm endif -DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h -DSP_SRCS-$(HAVE_NEON) += arm/fdct_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct4x4_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct8x8_neon.c DSP_SRCS-$(HAVE_NEON) += arm/fdct16x16_neon.c DSP_SRCS-$(HAVE_NEON) += arm/fdct32x32_neon.c DSP_SRCS-$(HAVE_NEON) += arm/fdct_partial_neon.c -DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.h DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.c ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c DSP_SRCS-$(HAVE_MSA) += mips/fwd_dct32x32_msa.c DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_dct32x32_lsx.c endif # !CONFIG_VP9_HIGHBITDEPTH @@ -326,11 +326,14 @@ DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.h DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.c DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.h DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c +DSP_SRCS-$(HAVE_AVX2) += x86/quantize_avx2.c DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c DSP_SRCS-$(HAVE_VSX) += ppc/quantize_vsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/quantize_lsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_quantize_intrin_avx2.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_quantize_neon.c endif # avg @@ -374,6 +377,7 @@ DSP_SRCS-$(HAVE_MMI) += mips/subtract_mmi.c DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c +DSP_SRCS-$(HAVE_AVX2) += x86/subtract_avx2.c DSP_SRCS-$(HAVE_AVX512) += x86/sad4d_avx512.c DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm @@ -388,6 +392,9 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/subtract_lsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad_avx2.c endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS @@ -425,6 +432,7 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC diff --git 
a/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl index d3c668f9a..8725821b6 100644 --- a/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -527,6 +527,8 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct4x4_1 sse2 neon/; + specialize qw/vpx_highbd_fdct4x4_1 neon/; + $vpx_highbd_fdct4x4_1_neon=vpx_fdct4x4_1_neon; add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct8x8 neon sse2/; @@ -550,27 +552,29 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_fdct32x32_1 sse2 neon/; add_proto qw/void vpx_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_highbd_fdct4x4 sse2/; + specialize qw/vpx_highbd_fdct4x4 sse2 neon/; add_proto qw/void vpx_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_highbd_fdct8x8 sse2/; + specialize qw/vpx_highbd_fdct8x8 sse2 neon/; add_proto qw/void vpx_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_highbd_fdct8x8_1 neon/; $vpx_highbd_fdct8x8_1_neon=vpx_fdct8x8_1_neon; add_proto qw/void vpx_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_highbd_fdct16x16 sse2/; + specialize qw/vpx_highbd_fdct16x16 sse2 neon/; add_proto qw/void vpx_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vpx_highbd_fdct16x16_1 neon/; add_proto qw/void vpx_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_highbd_fdct32x32 sse2/; + specialize qw/vpx_highbd_fdct32x32 sse2 neon/; add_proto qw/void vpx_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_highbd_fdct32x32_rd sse2/; + specialize qw/vpx_highbd_fdct32x32_rd sse2 neon/; add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vpx_highbd_fdct32x32_1 neon/; } else { add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct4x4 neon sse2 msa lsx/; @@ -711,17 +715,17 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_quantize_b neon sse2 ssse3 avx vsx lsx/; + specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/; add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx lsx/; + specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, 
const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_highbd_quantize_b sse2/; + specialize qw/vpx_highbd_quantize_b neon sse2 avx2/; add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_highbd_quantize_b_32x32 sse2/; + specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER @@ -730,7 +734,7 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes") { # Block subtraction # add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; -specialize qw/vpx_subtract_block neon msa mmi sse2 vsx lsx/; +specialize qw/vpx_subtract_block neon msa mmi sse2 avx2 vsx lsx/; # # Single block SAD @@ -795,7 +799,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx lsx/; add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; - specialize qw/vpx_hadamard_32x32 sse2 avx2/; + specialize qw/vpx_hadamard_32x32 sse2 avx2 neon/; add_proto qw/void vpx_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; specialize qw/vpx_highbd_hadamard_8x8 avx2/; @@ -819,7 +823,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx lsx/; add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; - specialize qw/vpx_hadamard_32x32 sse2 avx2/; + specialize qw/vpx_hadamard_32x32 sse2 avx2 neon/; add_proto qw/int vpx_satd/, "const int16_t *coeff, int length"; specialize qw/vpx_satd avx2 sse2 neon msa/; @@ -935,46 +939,49 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # Block subtraction # add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd"; + specialize qw/vpx_highbd_subtract_block neon avx2/; # # Single block SAD # add_proto qw/unsigned int vpx_highbd_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad64x64 sse2/; + specialize qw/vpx_highbd_sad64x64 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad64x32 sse2/; + specialize qw/vpx_highbd_sad64x32 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad32x64 sse2/; + specialize qw/vpx_highbd_sad32x64 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad32x32 sse2/; + specialize qw/vpx_highbd_sad32x32 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad32x16/, "const uint8_t *src_ptr, int 
src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad32x16 sse2/; + specialize qw/vpx_highbd_sad32x16 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad16x32 sse2/; + specialize qw/vpx_highbd_sad16x32 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad16x16 sse2/; + specialize qw/vpx_highbd_sad16x16 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad16x8 sse2/; + specialize qw/vpx_highbd_sad16x8 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad8x16 sse2/; + specialize qw/vpx_highbd_sad8x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad8x8 sse2/; + specialize qw/vpx_highbd_sad8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad8x4 sse2/; + specialize qw/vpx_highbd_sad8x4 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad4x8 neon/; add_proto qw/unsigned int vpx_highbd_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad4x4 neon/; # # Avg @@ -988,83 +995,85 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max"; add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad64x64_avg sse2/; + specialize qw/vpx_highbd_sad64x64_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad64x32_avg sse2/; + specialize qw/vpx_highbd_sad64x32_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad32x64_avg sse2/; + specialize qw/vpx_highbd_sad32x64_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad32x32_avg sse2/; + specialize qw/vpx_highbd_sad32x32_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad32x16_avg sse2/; + specialize qw/vpx_highbd_sad32x16_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad16x32_avg sse2/; + 
specialize qw/vpx_highbd_sad16x32_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad16x16_avg sse2/; + specialize qw/vpx_highbd_sad16x16_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad16x8_avg sse2/; + specialize qw/vpx_highbd_sad16x8_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad8x16_avg sse2/; + specialize qw/vpx_highbd_sad8x16_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad8x8_avg sse2/; + specialize qw/vpx_highbd_sad8x8_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad8x4_avg sse2/; + specialize qw/vpx_highbd_sad8x4_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + specialize qw/vpx_highbd_sad4x8_avg neon/; add_proto qw/unsigned int vpx_highbd_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + specialize qw/vpx_highbd_sad4x4_avg neon/; # # Multi-block SAD, comparing a reference to N independent blocks # add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad64x64x4d sse2/; + specialize qw/vpx_highbd_sad64x64x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad64x32x4d sse2/; + specialize qw/vpx_highbd_sad64x32x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad32x64x4d sse2/; + specialize qw/vpx_highbd_sad32x64x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad32x32x4d sse2/; + specialize qw/vpx_highbd_sad32x32x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad32x16x4d sse2/; + specialize qw/vpx_highbd_sad32x16x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad16x32x4d sse2/; + specialize qw/vpx_highbd_sad16x32x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t 
sad_array[4]"; - specialize qw/vpx_highbd_sad16x16x4d sse2/; + specialize qw/vpx_highbd_sad16x16x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad16x8x4d sse2/; + specialize qw/vpx_highbd_sad16x8x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad8x16x4d sse2/; + specialize qw/vpx_highbd_sad8x16x4d sse2 neon/; add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad8x8x4d sse2/; + specialize qw/vpx_highbd_sad8x8x4d sse2 neon/; add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad8x4x4d sse2/; + specialize qw/vpx_highbd_sad8x4x4d sse2 neon/; add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad4x8x4d sse2/; + specialize qw/vpx_highbd_sad4x8x4d sse2 neon/; add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad4x4x4d sse2/; + specialize qw/vpx_highbd_sad4x4x4d sse2 neon/; # # Structured Similarity (SSIM) @@ -1232,369 +1241,397 @@ add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, i if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance64x64 sse2/; + specialize qw/vpx_highbd_12_variance64x64 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance64x32 sse2/; + specialize qw/vpx_highbd_12_variance64x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance32x64 sse2/; + specialize qw/vpx_highbd_12_variance32x64 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance32x32 sse2/; + specialize qw/vpx_highbd_12_variance32x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance32x16 sse2/; + specialize qw/vpx_highbd_12_variance32x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance16x32 sse2/; + specialize qw/vpx_highbd_12_variance16x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned 
int *sse"; - specialize qw/vpx_highbd_12_variance16x16 sse2/; + specialize qw/vpx_highbd_12_variance16x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance16x8 sse2/; + specialize qw/vpx_highbd_12_variance16x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance8x16 sse2/; + specialize qw/vpx_highbd_12_variance8x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance8x8 sse2/; + specialize qw/vpx_highbd_12_variance8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance8x4 neon/; add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance4x8 neon/; add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance4x4 neon/; add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance64x64 sse2/; + specialize qw/vpx_highbd_10_variance64x64 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance64x32 sse2/; + specialize qw/vpx_highbd_10_variance64x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance32x64 sse2/; + specialize qw/vpx_highbd_10_variance32x64 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance32x32 sse2/; + specialize qw/vpx_highbd_10_variance32x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance32x16 sse2/; + specialize qw/vpx_highbd_10_variance32x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance16x32 sse2/; + specialize qw/vpx_highbd_10_variance16x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance16x16 sse2/; + specialize qw/vpx_highbd_10_variance16x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize 
qw/vpx_highbd_10_variance16x8 sse2/; + specialize qw/vpx_highbd_10_variance16x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance8x16 sse2/; + specialize qw/vpx_highbd_10_variance8x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance8x8 sse2/; + specialize qw/vpx_highbd_10_variance8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance8x4 neon/; add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance4x8 neon/; add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance4x4 neon/; add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance64x64 sse2/; + specialize qw/vpx_highbd_8_variance64x64 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance64x32 sse2/; + specialize qw/vpx_highbd_8_variance64x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance32x64 sse2/; + specialize qw/vpx_highbd_8_variance32x64 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance32x32 sse2/; + specialize qw/vpx_highbd_8_variance32x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance32x16 sse2/; + specialize qw/vpx_highbd_8_variance32x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance16x32 sse2/; + specialize qw/vpx_highbd_8_variance16x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance16x16 sse2/; + specialize qw/vpx_highbd_8_variance16x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance16x8 sse2/; + specialize qw/vpx_highbd_8_variance16x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance8x16 sse2/; + specialize qw/vpx_highbd_8_variance8x16 
sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance8x8 sse2/; + specialize qw/vpx_highbd_8_variance8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance8x4 neon/; add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance4x8 neon/; add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance4x4 neon/; add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_8_get16x16var sse2/; + specialize qw/vpx_highbd_8_get16x16var sse2 neon/; add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_8_get8x8var sse2/; + specialize qw/vpx_highbd_8_get8x8var sse2 neon/; add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_10_get16x16var sse2/; + specialize qw/vpx_highbd_10_get16x16var sse2 neon/; add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_10_get8x8var sse2/; + specialize qw/vpx_highbd_10_get8x8var sse2 neon/; add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_12_get16x16var sse2/; + specialize qw/vpx_highbd_12_get16x16var sse2 neon/; add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_12_get8x8var sse2/; + specialize qw/vpx_highbd_12_get8x8var sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse16x16 sse2/; + specialize qw/vpx_highbd_8_mse16x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_mse16x8 neon/; add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_mse8x16 neon/; add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse8x8 sse2/; + specialize qw/vpx_highbd_8_mse8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_mse16x16 sse2/; + specialize qw/vpx_highbd_10_mse16x16 sse2 neon/; add_proto qw/unsigned 
int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_mse16x8 neon/; add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_mse8x16 neon/; add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_mse8x8 sse2/; + specialize qw/vpx_highbd_10_mse8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_mse16x16 sse2/; + specialize qw/vpx_highbd_12_mse16x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_mse16x8 neon/; add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_mse8x16 neon/; add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_mse8x8 sse2/; + specialize qw/vpx_highbd_12_mse8x8 sse2 neon/; add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride"; + specialize qw/vpx_highbd_comp_avg_pred neon sse2/; # # Subpixel Variance # add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance64x64 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance64x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance64x32 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance64x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance32x64 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance32x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance32x32 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance32x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance32x16 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance32x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance16x32 sse2/; + specialize 
qw/vpx_highbd_12_sub_pixel_variance16x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance16x16 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance16x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance16x8 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance16x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance8x16 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance8x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance8x8 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance8x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance8x4 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_12_sub_pixel_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_12_sub_pixel_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance64x64 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance64x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance64x32 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance64x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance32x64 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance32x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance32x32 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance32x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize 
qw/vpx_highbd_10_sub_pixel_variance32x16 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance32x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance16x32 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance16x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance16x16 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance16x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance16x8 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance16x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance8x16 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance8x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance8x8 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance8x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance8x4 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_10_sub_pixel_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_10_sub_pixel_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance64x64 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance64x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance64x32 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance64x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance32x64 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance32x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, 
int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance32x32 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance32x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance32x16 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance32x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance16x32 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance16x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance16x16 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance16x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance16x8 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance16x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance8x16 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance8x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance8x8 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance8x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance8x4 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_8_sub_pixel_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_8_sub_pixel_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32 sse2 neon/; add_proto qw/uint32_t 
vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int 
y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; 
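(Each add_proto line in vpx_dsp_rtcd_defs.pl declares a function signature, and the specialize line that follows it, as with the 8x8 prototype just above, lists the SIMD implementations allowed to replace the C fallback; this change widens those lists now that NEON, and for the larger blocks AVX2, kernels exist. Below is a rough editorial sketch, not the generated source, of the runtime dispatch the RTCD generator derives from one such pair; the setup function and probing flag here are simplified assumptions.)

#include <stdint.h>

/* Sketch (assumed): the generated vpx_dsp_rtcd.h exposes each specialized
 * function through a pointer that init code repoints to the best variant
 * the host CPU supports. The _c and _neon definitions live elsewhere. */
typedef uint32_t (*subpel_avg_var_fn)(const uint8_t *src_ptr, int src_stride,
                                      int x_offset, int y_offset,
                                      const uint8_t *ref_ptr, int ref_stride,
                                      uint32_t *sse,
                                      const uint8_t *second_pred);

uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *, int, int,
                                                   int, const uint8_t *, int,
                                                   uint32_t *,
                                                   const uint8_t *);
uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_neon(const uint8_t *, int,
                                                      int, int,
                                                      const uint8_t *, int,
                                                      uint32_t *,
                                                      const uint8_t *);

/* Starts at the C version; setup repoints it once at library init. */
subpel_avg_var_fn vpx_highbd_10_sub_pixel_avg_variance8x8 =
    vpx_highbd_10_sub_pixel_avg_variance8x8_c;

static void setup_rtcd_sketch(int have_neon) {
  if (have_neon) {
    vpx_highbd_10_sub_pixel_avg_variance8x8 =
        vpx_highbd_10_sub_pixel_avg_variance8x8_neon;
  }
}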
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16 sse2 neon/; add_proto qw/uint32_t 
vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x4 neon/; } # CONFIG_VP9_HIGHBITDEPTH diff --git a/libvpx/vpx_dsp/x86/avg_intrin_avx2.c b/libvpx/vpx_dsp/x86/avg_intrin_avx2.c index 3f4f577a2..b2e01319d 100644 --- a/libvpx/vpx_dsp/x86/avg_intrin_avx2.c +++ b/libvpx/vpx_dsp/x86/avg_intrin_avx2.c @@ -104,7 +104,7 @@ void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride, src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); - src16[7] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[7] = _mm_loadu_si128((const __m128i *)(src_diff + src_stride)); src32[0] = _mm256_cvtepi16_epi32(src16[0]); src32[1] = _mm256_cvtepi16_epi32(src16[1]); @@ -304,7 +304,7 @@ static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride, src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); - src[7] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[7] = _mm256_loadu_si256((const __m256i *)(src_diff + src_stride)); hadamard_col8x2_avx2(src, 0); hadamard_col8x2_avx2(src, 1); diff --git a/libvpx/vpx_dsp/x86/avg_intrin_sse2.c b/libvpx/vpx_dsp/x86/avg_intrin_sse2.c index 9da2f34c9..015c11a1f 100644 --- a/libvpx/vpx_dsp/x86/avg_intrin_sse2.c +++ b/libvpx/vpx_dsp/x86/avg_intrin_sse2.c @@ -164,7 +164,7 @@ unsigned int vpx_highbd_avg_8x8_sse2(const uint8_t *s8, int p) { s0 = _mm_add_epi32(s0, s1); s0 = 
_mm_add_epi32(s0, _mm_srli_si128(s0, 8)); s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 4)); - avg = _mm_cvtsi128_si32(s0); + avg = (unsigned int)_mm_cvtsi128_si32(s0); return (avg + 32) >> 6; } @@ -275,7 +275,7 @@ static INLINE void hadamard_8x8_sse2(const int16_t *src_diff, src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride)); hadamard_col8_sse2(src, 0); hadamard_col8_sse2(src, 1); diff --git a/libvpx/vpx_dsp/x86/convolve_avx2.h b/libvpx/vpx_dsp/x86/convolve_avx2.h index 99bc9637f..ebee964b1 100644 --- a/libvpx/vpx_dsp/x86/convolve_avx2.h +++ b/libvpx/vpx_dsp/x86/convolve_avx2.h @@ -129,9 +129,8 @@ static INLINE void mm256_storeu2_epi64(__m128i *const dst_ptr_1, static INLINE void mm256_storeu2_epi32(__m128i *const dst_ptr_1, __m128i *const dst_ptr_2, const __m256i *const src) { - *((uint32_t *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src)); - *((uint32_t *)(dst_ptr_2)) = - _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1)); + *((int *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src)); + *((int *)(dst_ptr_2)) = _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1)); } static INLINE __m256i mm256_round_epi32(const __m256i *const src, diff --git a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h index 3f158b5e4..f3a802029 100644 --- a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h +++ b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h @@ -89,7 +89,7 @@ void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) { const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64); const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64); const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING); - const __m256i kZero = _mm256_set1_epi16(0); + const __m256i kZero = _mm256_setzero_si256(); const __m256i kOne = _mm256_set1_epi16(1); // Do the two transform/transpose passes int pass; diff --git a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h index ac1246faa..bf350b6da 100644 --- a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h +++ b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h @@ -100,7 +100,7 @@ void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) { const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64); const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); + const __m128i kZero = _mm_setzero_si128(); const __m128i kOne = _mm_set1_epi16(1); // Do the two transform/transpose passes diff --git a/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h index 78cf9111d..1d07391b0 100644 --- a/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h +++ b/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h @@ -249,7 +249,7 @@ static INLINE void highbd_idct16_4col_stage7(const __m128i *const in, static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1, const int bd) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); // Faster than _mm_set1_epi16((1 << bd) - 1). 
const __m128i one = _mm_set1_epi16(1); const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); diff --git a/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c b/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c index d265fc1a9..9f45623de 100644 --- a/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c +++ b/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c @@ -18,7 +18,7 @@ static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) { __m128i lbounded; __m128i retval; - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); __m128i t80, max, min; @@ -51,7 +51,7 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); __m128i blimit_v, limit_v, thresh_v; __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0; @@ -492,7 +492,7 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch, DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]); DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]); DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]); - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); __m128i blimit_v, limit_v, thresh_v; __m128i mask, hev, flat; __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * pitch)); @@ -720,7 +720,7 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); __m128i blimit_v, limit_v, thresh_v; __m128i mask, hev, flat; __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch)); diff --git a/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c new file mode 100644 index 000000000..8edddd637 --- /dev/null +++ b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -0,0 +1,258 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+  const __m128i sign = _mm_srai_epi16(*p, 15);
+  const __m128i dc = _mm_unpacklo_epi16(*p, sign);
+  const __m128i ac = _mm_unpackhi_epi16(*p, sign);
+  *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+
+static VPX_FORCE_INLINE void update_qp(__m256i *qp) {
+  int i;
+  for (i = 0; i < 5; ++i) {
+    qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11);
+  }
+}
+
+static VPX_FORCE_INLINE void init_qp(const int16_t *zbin_ptr,
+                                     const int16_t *round_ptr,
+                                     const int16_t *quant_ptr,
+                                     const int16_t *dequant_ptr,
+                                     const int16_t *quant_shift_ptr,
+                                     __m256i *qp, int log_scale) {
+  const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr);
+  const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+  const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+  const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+  const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr);
+  init_one_qp(&zbin, &qp[0]);
+  init_one_qp(&round, &qp[1]);
+  init_one_qp(&quant, &qp[2]);
+  init_one_qp(&dequant, &qp[3]);
+  init_one_qp(&quant_shift, &qp[4]);
+  if (log_scale > 0) {
+    const __m256i rnd = _mm256_set1_epi32((int16_t)(1 << (log_scale - 1)));
+    qp[0] = _mm256_add_epi32(qp[0], rnd);
+    qp[0] = _mm256_srai_epi32(qp[0], log_scale);
+
+    qp[1] = _mm256_add_epi32(qp[1], rnd);
+    qp[1] = _mm256_srai_epi32(qp[1], log_scale);
+  }
+  // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when
+  // calculating the zbin mask.
+  qp[0] = _mm256_sub_epi32(qp[0], _mm256_set1_epi32(1));
+}
+
+// Note:
+// Each of the 8 int32_t lanes of *x is multiplied by the corresponding lane
+// of *y; each 64-bit product is shifted right by 16 and the low 32 bits of
+// the result are packed into the returned vector.
+static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32(const __m256i *x,
+                                                      const __m256i *y) {
+  __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+  __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+  const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+  const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+  prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+  prod_lo = _mm256_srli_epi64(prod_lo, 16);
+  prod_lo = _mm256_and_si256(prod_lo, mask);
+  prod_hi = _mm256_srli_epi64(prod_hi, 16);
+  prod_hi = _mm256_slli_epi64(prod_hi, 32);
+  return _mm256_or_si256(prod_lo, prod_hi);
+}
+
+static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan_ptr,
+                                                 __m256i eobmax,
+                                                 __m256i nz_mask) {
+  const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask);
+  const __m256i packed_nz_mask_perm =
+      _mm256_permute4x64_epi64(packed_nz_mask, 0xD8);
+  const __m256i iscan =
+      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr));
+  const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm);
+  return _mm256_max_epi16(eobmax, nz_iscan);
+}
+
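(For reference, the per-coefficient arithmetic these helpers vectorize is shown in the scalar sketch below. This is an editorial illustration, not code from this commit: it mirrors what quantize() further down does across 8 lanes at once in the log_scale == 0 case, with the eob bookkeeping omitted.)

#include <stdint.h>
#include <stdlib.h>

/* Sketch only: quantize one coefficient. zbin, round, quant, quant_shift
 * and dequant are the per-coefficient int32 values held in qp[0..4]. */
static int32_t quantize_coeff_sketch(int32_t coeff, int32_t zbin,
                                     int32_t round, int32_t quant,
                                     int32_t quant_shift, int32_t dequant,
                                     int32_t *dqcoeff) {
  const int32_t abs_coeff = abs(coeff);
  int64_t tmp;
  int32_t abs_q;
  /* The vector code tests abs_coeff > (zbin - 1); init_qp() pre-subtracts
   * the 1 so a single _mm256_cmpgt_epi32() builds the mask. */
  if (abs_coeff < zbin) {
    *dqcoeff = 0;
    return 0;
  }
  tmp = (int64_t)abs_coeff + round;
  tmp += (tmp * quant) >> 16;     /* mm256_mul_shift_epi32() plus add */
  abs_q = (int32_t)((tmp * quant_shift) >> 16);
  if (coeff < 0) abs_q = -abs_q;  /* _mm256_sign_epi32() */
  *dqcoeff = abs_q * dequant;
  return abs_q;
}

The exported entry points below walk the block 8 coefficients at a time, calling quantize() and folding each lane's iscan position into a running maximum, which get_max_eob() then reduces to the final eob.

+// Get the max eob from the lower 128 bits.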
+static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob) { + __m256i eob_s; + eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); +#if defined(_MSC_VER) && (_MSC_VER < 1910) + return _mm_cvtsi128_si32(_mm256_extracti128_si256(eob, 0)) & 0xffff; +#else + return (uint16_t)_mm256_extract_epi16(eob, 0); +#endif +} + +static VPX_FORCE_INLINE void quantize(const __m256i *qp, + const tran_low_t *coeff_ptr, + const int16_t *iscan_ptr, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + __m256i *eob) { + const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi32(coeff); + const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]); + + if (_mm256_movemask_epi8(zbin_mask) == 0) { + const __m256i zero = _mm256_setzero_si256(); + _mm256_storeu_si256((__m256i *)qcoeff, zero); + _mm256_storeu_si256((__m256i *)dqcoeff, zero); + return; + } + { + const __m256i tmp_rnd = + _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask); + const __m256i tmp = mm256_mul_shift_epi32(&tmp_rnd, &qp[2]); + const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd); + const __m256i abs_q = mm256_mul_shift_epi32(&tmp2, &qp[4]); + const __m256i abs_dq = _mm256_mullo_epi32(abs_q, qp[3]); + const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); + const __m256i q = _mm256_sign_epi32(abs_q, coeff); + const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); + + _mm256_storeu_si256((__m256i *)qcoeff, q); + _mm256_storeu_si256((__m256i *)dqcoeff, dq); + + *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); + } +} + +void vpx_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int step = 8; + __m256i eob = _mm256_setzero_si256(); + __m256i qp[5]; + (void)scan; + + init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 0); + + quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + + update_qp(qp); + + while (n_coeffs > 0) { + quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + } + + *eob_ptr = get_max_eob(eob); +} + +static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x, + const __m256i *y, + int log_scale) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale); + prod_lo = _mm256_and_si256(prod_lo, mask); + prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale); + prod_hi = _mm256_slli_epi64(prod_hi, 32); + return _mm256_or_si256(prod_lo, prod_hi); +} + +static VPX_FORCE_INLINE void quantize_b_32x32( + const __m256i *qp, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) { + const __m256i coeff = 
_mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi32(coeff); + const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]); + + if (_mm256_movemask_epi8(zbin_mask) == 0) { + const __m256i zero = _mm256_setzero_si256(); + _mm256_storeu_si256((__m256i *)qcoeff, zero); + _mm256_storeu_si256((__m256i *)dqcoeff, zero); + return; + } + + { + const __m256i tmp_rnd = + _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask); + // const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; + const __m256i tmp = mm256_mul_shift_epi32_logscale(&tmp_rnd, &qp[2], 0); + const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd); + // const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp2, &qp[4], 1); + const __m256i abs_dq = + _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, qp[3]), 1); + const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); + const __m256i q = _mm256_sign_epi32(abs_q, coeff); + const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); + + _mm256_storeu_si256((__m256i *)qcoeff, q); + _mm256_storeu_si256((__m256i *)dqcoeff, dq); + + *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); + } +} + +void vpx_highbd_quantize_b_32x32_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const unsigned int step = 8; + __m256i eob = _mm256_setzero_si256(); + __m256i qp[5]; + (void)scan; + + init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 1); + + quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + + update_qp(qp); + + while (n_coeffs > 0) { + quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + } + + *eob_ptr = get_max_eob(eob); +} diff --git a/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index 4535a0f7a..ae1981a83 100644 --- a/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -25,7 +25,7 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - int i, j, non_zero_regs = (int)count / 4, eob_i = -1; + int i, j, non_zero_regs = (int)count / 4, eob_i = 0; __m128i zbins[2]; __m128i nzbins[2]; @@ -82,13 +82,14 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; const uint32_t abs_qcoeff = (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); - qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; + qcoeff_ptr[k] = + (int)(abs_qcoeff ^ (uint32_t)coeff_sign[j]) - coeff_sign[j]; dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; if (abs_qcoeff) eob_i = iscan[k] > eob_i ? 
iscan[k] : eob_i; } } } - *eob_ptr = eob_i + 1; + *eob_ptr = eob_i; } void vpx_highbd_quantize_b_32x32_sse2( @@ -101,7 +102,7 @@ void vpx_highbd_quantize_b_32x32_sse2( __m128i nzbins[2]; int idx = 0; int idx_arr[1024]; - int i, eob = -1; + int i, eob = 0; const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); (void)scan; @@ -143,10 +144,10 @@ void vpx_highbd_quantize_b_32x32_sse2( const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; const uint32_t abs_qcoeff = (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; } - *eob_ptr = eob + 1; + *eob_ptr = eob; } #endif diff --git a/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c b/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c new file mode 100644 index 000000000..947b5e977 --- /dev/null +++ b/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include <immintrin.h> // AVX2 +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +static VPX_FORCE_INLINE void calc_final_4(const __m256i *const sums /*[4]*/, + uint32_t sad_array[4]) { + const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]); + const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]); + const __m256i t2 = _mm256_hadd_epi32(t0, t1); + const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2), + _mm256_extractf128_si256(t2, 1)); + _mm_storeu_si128((__m128i *)sad_array, sum); +} + +static VPX_FORCE_INLINE void highbd_sad64xHx4d(__m256i *sums_16 /*[4]*/, + const uint16_t *src, + int src_stride, + uint16_t *refs[4], + int ref_stride, int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32)); + const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48)); + int x; + + for (x = 0; x < 4; ++x) { + __m256i r[4]; + r[0] = _mm256_loadu_si256((const __m256i *)refs[x]); + r[1] = _mm256_loadu_si256((const __m256i *)(refs[x] + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)(refs[x] + 32)); + r[3] = _mm256_loadu_si256((const __m256i *)(refs[x] + 48)); + + // absolute differences between every ref[] to src + r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s0)); + r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s1)); + r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s2)); + r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s3)); + + // sum every abs diff + sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[0], r[1])); + sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[2], r[3])); + } + + src += src_stride; + refs[0] += ref_stride; + refs[1] += ref_stride; + refs[2] += ref_stride; + refs[3] += ref_stride; + } +} + +#define HIGHBD_SAD64XNX4D(n) \ + void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src_ptr, int 
src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *refs[4]; \ + __m256i sums_16[4]; \ + __m256i sums_32[4]; \ + int i; \ + \ + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \ + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \ + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \ + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \ + sums_32[0] = _mm256_setzero_si256(); \ + sums_32[1] = _mm256_setzero_si256(); \ + sums_32[2] = _mm256_setzero_si256(); \ + sums_32[3] = _mm256_setzero_si256(); \ + \ + for (i = 0; i < (n / 2); ++i) { \ + sums_16[0] = _mm256_setzero_si256(); \ + sums_16[1] = _mm256_setzero_si256(); \ + sums_16[2] = _mm256_setzero_si256(); \ + sums_16[3] = _mm256_setzero_si256(); \ + \ + highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2); \ + \ + /* sums_16 will outrange after 2 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32[0] = _mm256_add_epi32( \ + sums_32[0], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[0], 1)))); \ + sums_32[1] = _mm256_add_epi32( \ + sums_32[1], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[1], 1)))); \ + sums_32[2] = _mm256_add_epi32( \ + sums_32[2], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[2], 1)))); \ + sums_32[3] = _mm256_add_epi32( \ + sums_32[3], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[3], 1)))); \ + \ + src += src_stride << 1; \ + } \ + calc_final_4(sums_32, sad_array); \ + } + +// 64x64 +HIGHBD_SAD64XNX4D(64) + +// 64x32 +HIGHBD_SAD64XNX4D(32) + +static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/, + const uint16_t *src, + int src_stride, + uint16_t *refs[4], + int ref_stride, int height) { + int i; + for (i = 0; i < height; i++) { + __m256i r[8]; + + // load src and all ref[] + const __m256i s = _mm256_load_si256((const __m256i *)src); + const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 16)); + r[0] = _mm256_loadu_si256((const __m256i *)refs[0]); + r[1] = _mm256_loadu_si256((const __m256i *)(refs[0] + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)refs[1]); + r[3] = _mm256_loadu_si256((const __m256i *)(refs[1] + 16)); + r[4] = _mm256_loadu_si256((const __m256i *)refs[2]); + r[5] = _mm256_loadu_si256((const __m256i *)(refs[2] + 16)); + r[6] = _mm256_loadu_si256((const __m256i *)refs[3]); + r[7] = _mm256_loadu_si256((const __m256i *)(refs[3] + 16)); + + // absolute differences between every ref[] to src + r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s)); + r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s2)); + r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s)); + r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s2)); + r[4] = _mm256_abs_epi16(_mm256_sub_epi16(r[4], s)); + r[5] = _mm256_abs_epi16(_mm256_sub_epi16(r[5], s2)); + r[6] = _mm256_abs_epi16(_mm256_sub_epi16(r[6], s)); + r[7] = _mm256_abs_epi16(_mm256_sub_epi16(r[7], s2)); + + // sum every abs diff + sums_16[0] = _mm256_add_epi16(sums_16[0], _mm256_add_epi16(r[0], r[1])); + sums_16[1] = _mm256_add_epi16(sums_16[1], _mm256_add_epi16(r[2], r[3])); + sums_16[2] = _mm256_add_epi16(sums_16[2], _mm256_add_epi16(r[4], 
r[5])); + sums_16[3] = _mm256_add_epi16(sums_16[3], _mm256_add_epi16(r[6], r[7])); + + src += src_stride; + refs[0] += ref_stride; + refs[1] += ref_stride; + refs[2] += ref_stride; + refs[3] += ref_stride; + } +} + +#define HIGHBD_SAD32XNX4D(n) \ + void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *refs[4]; \ + __m256i sums_16[4]; \ + __m256i sums_32[4]; \ + int i; \ + \ + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \ + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \ + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \ + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \ + sums_32[0] = _mm256_setzero_si256(); \ + sums_32[1] = _mm256_setzero_si256(); \ + sums_32[2] = _mm256_setzero_si256(); \ + sums_32[3] = _mm256_setzero_si256(); \ + \ + for (i = 0; i < (n / 8); ++i) { \ + sums_16[0] = _mm256_setzero_si256(); \ + sums_16[1] = _mm256_setzero_si256(); \ + sums_16[2] = _mm256_setzero_si256(); \ + sums_16[3] = _mm256_setzero_si256(); \ + \ + highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); \ + \ + /* sums_16 will outrange after 8 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32[0] = _mm256_add_epi32( \ + sums_32[0], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[0], 1)))); \ + sums_32[1] = _mm256_add_epi32( \ + sums_32[1], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[1], 1)))); \ + sums_32[2] = _mm256_add_epi32( \ + sums_32[2], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[2], 1)))); \ + sums_32[3] = _mm256_add_epi32( \ + sums_32[3], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[3], 1)))); \ + \ + src += src_stride << 3; \ + } \ + calc_final_4(sums_32, sad_array); \ + } + +// 32x64 +HIGHBD_SAD32XNX4D(64) + +// 32x32 +HIGHBD_SAD32XNX4D(32) + +// 32x16 +HIGHBD_SAD32XNX4D(16) + +static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/, + const uint16_t *src, + int src_stride, + uint16_t *refs[4], + int ref_stride, int height) { + int i; + for (i = 0; i < height; i++) { + __m256i r[4]; + + // load src and all ref[] + const __m256i s = _mm256_load_si256((const __m256i *)src); + r[0] = _mm256_loadu_si256((const __m256i *)refs[0]); + r[1] = _mm256_loadu_si256((const __m256i *)refs[1]); + r[2] = _mm256_loadu_si256((const __m256i *)refs[2]); + r[3] = _mm256_loadu_si256((const __m256i *)refs[3]); + + // absolute differences between every ref[] to src + r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s)); + r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s)); + r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s)); + r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s)); + + // sum every abs diff + sums_16[0] = _mm256_add_epi16(sums_16[0], r[0]); + sums_16[1] = _mm256_add_epi16(sums_16[1], r[1]); + sums_16[2] = _mm256_add_epi16(sums_16[2], r[2]); + sums_16[3] = _mm256_add_epi16(sums_16[3], r[3]); + + src += src_stride; + refs[0] += ref_stride; + refs[1] += ref_stride; + refs[2] += ref_stride; + refs[3] += ref_stride; + } +} + +void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride, 
+ const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4]) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + __m256i sums_32[4]; + int i; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_32[0] = _mm256_setzero_si256(); + sums_32[1] = _mm256_setzero_si256(); + sums_32[2] = _mm256_setzero_si256(); + sums_32[3] = _mm256_setzero_si256(); + + for (i = 0; i < 2; ++i) { + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16); + + // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 + sums_32[0] = _mm256_add_epi32( + sums_32[0], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)))); + sums_32[1] = _mm256_add_epi32( + sums_32[1], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)))); + sums_32[2] = _mm256_add_epi32( + sums_32[2], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)))); + sums_32[3] = _mm256_add_epi32( + sums_32[3], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)))); + + src += src_stride << 4; + } + calc_final_4(sums_32, sad_array); +} + +void vpx_highbd_sad16x16x4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4]) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16); + + { + __m256i sums_32[4]; + sums_32[0] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))); + sums_32[1] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))); + sums_32[2] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))); + sums_32[3] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))); + calc_final_4(sums_32, sad_array); + } +} + +void vpx_highbd_sad16x8x4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4]) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = 
CONVERT_TO_SHORTPTR(ref_array[3]); + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); + + { + __m256i sums_32[4]; + sums_32[0] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))); + sums_32[1] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))); + sums_32[2] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))); + sums_32[3] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))); + calc_final_4(sums_32, sad_array); + } +} diff --git a/libvpx/vpx_dsp/x86/highbd_sad_avx2.c b/libvpx/vpx_dsp/x86/highbd_sad_avx2.c new file mode 100644 index 000000000..231b67f80 --- /dev/null +++ b/libvpx/vpx_dsp/x86/highbd_sad_avx2.c @@ -0,0 +1,468 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include <immintrin.h> +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) { + const __m256i t0 = _mm256_add_epi32(sums_32, _mm256_srli_si256(sums_32, 8)); + const __m256i t1 = _mm256_add_epi32(t0, _mm256_srli_si256(t0, 4)); + const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t1), + _mm256_extractf128_si256(t1, 1)); + return (unsigned int)_mm_cvtsi128_si32(sum); +} + +static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16, + const uint16_t *src, int src_stride, + uint16_t *ref, int ref_stride, + int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32)); + const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); + const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32)); + const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48)); + // absolute differences between every ref[] to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1)); + const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(r2, s2)); + const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(r3, s3)); + // sum every abs diff + *sums_16 = + _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1)); + *sums_16 = + _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3)); + + src += src_stride; + ref += ref_stride; + } +} + +#define HIGHBD_SAD64XN(n) \ + unsigned int vpx_highbd_sad64x##n##_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + 
int ref_stride) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + __m256i sums_32 = _mm256_setzero_si256(); \ + int i; \ + \ + for (i = 0; i < (n / 2); ++i) { \ + __m256i sums_16 = _mm256_setzero_si256(); \ + \ + highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); \ + \ + /* sums_16 will outrange after 2 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32 = _mm256_add_epi32( \ + sums_32, \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ + \ + src += src_stride << 1; \ + ref += ref_stride << 1; \ + } \ + return calc_final(sums_32); \ + } + +// 64x64 +HIGHBD_SAD64XN(64) + +// 64x32 +HIGHBD_SAD64XN(32) + +static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, + const uint16_t *src, int src_stride, + uint16_t *ref, int ref_stride, + int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); + // absolute differences between every ref[] to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1)); + // sum every abs diff + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); + + src += src_stride; + ref += ref_stride; + } +} + +#define HIGHBD_SAD32XN(n) \ + unsigned int vpx_highbd_sad32x##n##_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + __m256i sums_32 = _mm256_setzero_si256(); \ + int i; \ + \ + for (i = 0; i < (n / 8); ++i) { \ + __m256i sums_16 = _mm256_setzero_si256(); \ + \ + highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); \ + \ + /* sums_16 will outrange after 8 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32 = _mm256_add_epi32( \ + sums_32, \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ + \ + src += src_stride << 3; \ + ref += ref_stride << 3; \ + } \ + return calc_final(sums_32); \ + } + +// 32x64 +HIGHBD_SAD32XN(64) + +// 32x32 +HIGHBD_SAD32XN(32) + +// 32x16 +HIGHBD_SAD32XN(16) + +static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, + const uint16_t *src, int src_stride, + uint16_t *ref, int ref_stride, + int height) { + int i; + for (i = 0; i < height; i += 2) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride)); + // absolute differences between every ref[] to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1)); + // sum every abs diff + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); + + src += src_stride << 1; + ref += ref_stride << 1; + } +} + +unsigned int 
vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + int i; + + for (i = 0; i < 2; ++i) { + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16); + + // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 4; + ref += ref_stride << 4; + } + return calc_final(sums_32); +} + +unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} + +unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 8); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} + +// AVG ------------------------------------------------------------------------- +static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16, + const uint16_t *src, + int src_stride, uint16_t *ref, + int ref_stride, uint16_t *sec, + int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32)); + const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); + const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32)); + const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48)); + const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16)); + const __m256i x2 = _mm256_loadu_si256((const __m256i *)(sec + 32)); + const __m256i x3 = _mm256_loadu_si256((const __m256i *)(sec + 48)); + const __m256i avg0 = _mm256_avg_epu16(r0, x0); + const __m256i avg1 = _mm256_avg_epu16(r1, x1); + const __m256i avg2 = _mm256_avg_epu16(r2, x2); + const __m256i avg3 = _mm256_avg_epu16(r3, x3); + // absolute differences between every ref/pred avg to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1)); + const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(avg2, s2)); + const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(avg3, s3)); + // sum 
every abs diff + *sums_16 = + _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1)); + *sums_16 = + _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3)); + + src += src_stride; + ref += ref_stride; + sec += 64; + } +} + +#define HIGHBD_SAD64XN_AVG(n) \ + unsigned int vpx_highbd_sad64x##n##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); \ + __m256i sums_32 = _mm256_setzero_si256(); \ + int i; \ + \ + for (i = 0; i < (n / 2); ++i) { \ + __m256i sums_16 = _mm256_setzero_si256(); \ + \ + highbd_sad64xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 2); \ + \ + /* sums_16 will outrange after 2 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32 = _mm256_add_epi32( \ + sums_32, \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ + \ + src += src_stride << 1; \ + ref += ref_stride << 1; \ + sec += 64 << 1; \ + } \ + return calc_final(sums_32); \ + } + +// 64x64 +HIGHBD_SAD64XN_AVG(64) + +// 64x32 +HIGHBD_SAD64XN_AVG(32) + +static VPX_FORCE_INLINE void highbd_sad32xH_avg(__m256i *sums_16, + const uint16_t *src, + int src_stride, uint16_t *ref, + int ref_stride, uint16_t *sec, + int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); + const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16)); + const __m256i avg0 = _mm256_avg_epu16(r0, x0); + const __m256i avg1 = _mm256_avg_epu16(r1, x1); + // absolute differences between every ref/pred avg to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1)); + // sum every abs diff + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); + + src += src_stride; + ref += ref_stride; + sec += 32; + } +} + +#define HIGHBD_SAD32XN_AVG(n) \ + unsigned int vpx_highbd_sad32x##n##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); \ + __m256i sums_32 = _mm256_setzero_si256(); \ + int i; \ + \ + for (i = 0; i < (n / 8); ++i) { \ + __m256i sums_16 = _mm256_setzero_si256(); \ + \ + highbd_sad32xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8); \ + \ + /* sums_16 will outrange after 8 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32 = _mm256_add_epi32( \ + sums_32, \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ + \ + src += src_stride << 3; \ + ref += ref_stride << 3; \ + sec += 32 << 3; \ + } \ + return calc_final(sums_32); \ + } + +// 32x64 +HIGHBD_SAD32XN_AVG(64) + +// 32x32 +HIGHBD_SAD32XN_AVG(32) + +// 32x16 +HIGHBD_SAD32XN_AVG(16) + 
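A note on the flush intervals used throughout these highbd SAD kernels: with 12-bit input every absolute difference fits in 12 bits (at most 4095), so one uint16 lane can absorb 16 worst-case differences (16 * 4095 = 65520) before _mm256_add_epi16 would wrap past 65535. A 64-wide row contributes 4 differences per lane, a 32-wide row 2, and a 16-wide row 1, which is why the loops spill sums_16 into the 32-bit accumulators every 2, 8, and 16 rows respectively. A minimal sanity-check sketch of that bound, assuming 12-bit content (the helper name is illustrative, not part of the library):

#include <assert.h>
#include <stdint.h>

/* Worst case for one uint16 lane of sums_16 before it is flushed into the
 * 32-bit accumulators; illustrative only. */
static void check_sad16_flush(int width, int rows_per_flush) {
  const int32_t max_diff = (1 << 12) - 1;  /* 4095 for 12-bit content */
  const int lanes = 16;                    /* uint16 lanes per __m256i */
  const int32_t diffs_per_lane = (width / lanes) * rows_per_flush;
  assert(diffs_per_lane * max_diff <= UINT16_MAX);
}

int main(void) {
  check_sad16_flush(64, 2);  /* highbd_sad64xH:  8 * 4095 = 32760 */
  check_sad16_flush(32, 8);  /* highbd_sad32xH: 16 * 4095 = 65520 */
  check_sad16_flush(16, 16); /* highbd_sad16xH: 16 * 4095 = 65520 */
  return 0;
}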
+static VPX_FORCE_INLINE void highbd_sad16xH_avg(__m256i *sums_16, + const uint16_t *src, + int src_stride, uint16_t *ref, + int ref_stride, uint16_t *sec, + int height) { + int i; + for (i = 0; i < height; i += 2) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride)); + const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16)); + const __m256i avg0 = _mm256_avg_epu16(r0, x0); + const __m256i avg1 = _mm256_avg_epu16(r1, x1); + // absolute differences between every ref[] to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1)); + // sum every abs diff + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); + + src += src_stride << 1; + ref += ref_stride << 1; + sec += 32; + } +} + +unsigned int vpx_highbd_sad16x32_avg_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + const uint8_t *second_pred) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); + __m256i sums_32 = _mm256_setzero_si256(); + int i; + + for (i = 0; i < 2; ++i) { + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16); + + // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 4; + ref += ref_stride << 4; + sec += 16 << 4; + } + return calc_final(sums_32); +} + +unsigned int vpx_highbd_sad16x16_avg_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + const uint8_t *second_pred) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} + +unsigned int vpx_highbd_sad16x8_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} diff --git a/libvpx/vpx_dsp/x86/highbd_variance_sse2.c b/libvpx/vpx_dsp/x86/highbd_variance_sse2.c index 7c8d79b09..381e0ad19 100644 --- a/libvpx/vpx_dsp/x86/highbd_variance_sse2.c +++ 
b/libvpx/vpx_dsp/x86/highbd_variance_sse2.c @@ -7,6 +7,7 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#include <emmintrin.h> // SSE2 #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" @@ -559,3 +560,49 @@ FNS(sse2) #undef FNS #undef FN + +void vpx_highbd_comp_avg_pred_sse2(uint16_t *comp_pred, const uint16_t *pred, + int width, int height, const uint16_t *ref, + int ref_stride) { + int i, j; + if (width > 8) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 16) { + const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[j]); + const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[j + 8]); + const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[j]); + const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[j + 8]); + _mm_storeu_si128((__m128i *)&comp_pred[j], _mm_avg_epu16(p0, r0)); + _mm_storeu_si128((__m128i *)&comp_pred[j + 8], _mm_avg_epu16(p1, r1)); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } else if (width == 8) { + for (i = 0; i < height; i += 2) { + const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[0]); + const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[8]); + const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[0]); + const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[ref_stride]); + _mm_storeu_si128((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0)); + _mm_storeu_si128((__m128i *)&comp_pred[8], _mm_avg_epu16(p1, r1)); + comp_pred += 8 << 1; + pred += 8 << 1; + ref += ref_stride << 1; + } + } else { + assert(width == 4); + for (i = 0; i < height; i += 2) { + const __m128i p0 = _mm_loadl_epi64((const __m128i *)&pred[0]); + const __m128i p1 = _mm_loadl_epi64((const __m128i *)&pred[4]); + const __m128i r0 = _mm_loadl_epi64((const __m128i *)&ref[0]); + const __m128i r1 = _mm_loadl_epi64((const __m128i *)&ref[ref_stride]); + _mm_storel_epi64((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0)); + _mm_storel_epi64((__m128i *)&comp_pred[4], _mm_avg_epu16(p1, r1)); + comp_pred += 4 << 1; + pred += 4 << 1; + ref += ref_stride << 1; + } + } +} diff --git a/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/libvpx/vpx_dsp/x86/inv_txfm_sse2.c index 4b02da966..f42b3df84 100644 --- a/libvpx/vpx_dsp/x86/inv_txfm_sse2.c +++ b/libvpx/vpx_dsp/x86/inv_txfm_sse2.c @@ -243,7 +243,7 @@ void iadst8_sse2(__m128i *const in) { const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i kZero = _mm_set1_epi16(0); + const __m128i kZero = _mm_setzero_si128(); __m128i s[8], u[16], v[8], w[16]; // transpose @@ -546,7 +546,7 @@ void vpx_iadst16_8col_sse2(__m128i *const in) { const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i kZero = _mm_set1_epi16(0); + const __m128i kZero = _mm_setzero_si128(); u[0] = _mm_unpacklo_epi16(in[15], in[0]); u[1] = _mm_unpackhi_epi16(in[15], in[0]); diff --git a/libvpx/vpx_dsp/x86/loopfilter_avx2.c b/libvpx/vpx_dsp/x86/loopfilter_avx2.c index be391992a..a58fb6553 100644 --- a/libvpx/vpx_dsp/x86/loopfilter_avx2.c +++ b/libvpx/vpx_dsp/x86/loopfilter_avx2.c @@ -18,7 +18,7 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int pitch, const unsigned char *limit, const unsigned char *thresh) { __m128i 
mask, hev, flat, flat2; - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi8(1); __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; __m128i abs_p1p0; @@ -372,7 +372,7 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int pitch, const unsigned char *limit, const unsigned char *thresh) { __m128i mask, hev, flat, flat2; - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi8(1); __m128i p7, p6, p5; __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; diff --git a/libvpx/vpx_dsp/x86/loopfilter_sse2.c b/libvpx/vpx_dsp/x86/loopfilter_sse2.c index 347c9fdbe..6ea34cdd1 100644 --- a/libvpx/vpx_dsp/x86/loopfilter_sse2.c +++ b/libvpx/vpx_dsp/x86/loopfilter_sse2.c @@ -106,7 +106,7 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i limit_v = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit), _mm_loadl_epi64((const __m128i *)limit)); @@ -140,7 +140,7 @@ void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i limit_v = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit), _mm_loadl_epi64((const __m128i *)limit)); @@ -232,7 +232,7 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch, const unsigned char *blimit, const unsigned char *limit, const unsigned char *thresh) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi8(1); const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit); const __m128i limit_v = _mm_load_si128((const __m128i *)limit); @@ -594,7 +594,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int pitch, const unsigned char *blimit, const unsigned char *limit, const unsigned char *thresh) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi8(1); const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit); const __m128i limit_v = _mm_load_si128((const __m128i *)limit); @@ -932,7 +932,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int pitch, DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit); const __m128i limit_v = _mm_load_si128((const __m128i *)limit); const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh); @@ -1152,7 +1152,7 @@ void vpx_lpf_horizontal_8_dual_sse2( DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i blimit = _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0), _mm_load_si128((const __m128i *)blimit1)); @@ -1406,7 +1406,7 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int pitch, const __m128i thresh = 
_mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0), _mm_load_si128((const __m128i *)thresh1)); - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); __m128i p3, p2, p1, p0, q0, q1, q2, q3; __m128i mask, hev, flat; diff --git a/libvpx/vpx_dsp/x86/mem_sse2.h b/libvpx/vpx_dsp/x86/mem_sse2.h index 8b6d4d1dd..031f361a4 100644 --- a/libvpx/vpx_dsp/x86/mem_sse2.h +++ b/libvpx/vpx_dsp/x86/mem_sse2.h @@ -27,13 +27,13 @@ static INLINE int32_t loadu_int32(const void *src) { } static INLINE __m128i load_unaligned_u32(const void *a) { - uint32_t val; + int val; memcpy(&val, a, sizeof(val)); return _mm_cvtsi32_si128(val); } static INLINE void store_unaligned_u32(void *const a, const __m128i v) { - const uint32_t val = _mm_cvtsi128_si32(v); + const int val = _mm_cvtsi128_si32(v); memcpy(a, &val, sizeof(val)); } diff --git a/libvpx/vpx_dsp/x86/post_proc_sse2.c b/libvpx/vpx_dsp/x86/post_proc_sse2.c index d1029afc4..119fa7cd1 100644 --- a/libvpx/vpx_dsp/x86/post_proc_sse2.c +++ b/libvpx/vpx_dsp/x86/post_proc_sse2.c @@ -36,7 +36,7 @@ void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, __m128i s = _mm_loadl_epi64((__m128i *)dst); __m128i sum, sumsq_0, sumsq_1; __m128i tmp_0, tmp_1; - __m128i below_context; + __m128i below_context = _mm_setzero_si128(); s = _mm_unpacklo_epi8(s, zero); diff --git a/libvpx/vpx_dsp/x86/quantize_avx.c b/libvpx/vpx_dsp/x86/quantize_avx.c index 706e4e641..7d8352721 100644 --- a/libvpx/vpx_dsp/x86/quantize_avx.c +++ b/libvpx/vpx_dsp/x86/quantize_avx.c @@ -93,8 +93,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = - scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); } // AC only loop. @@ -134,8 +133,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); } @@ -229,8 +227,7 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8); - eob = - scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); } // AC only loop. @@ -272,8 +269,7 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); } diff --git a/libvpx/vpx_dsp/x86/quantize_avx2.c b/libvpx/vpx_dsp/x86/quantize_avx2.c new file mode 100644 index 000000000..28f7c9c7d --- /dev/null +++ b/libvpx/vpx_dsp/x86/quantize_avx2.c @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <immintrin.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +static VPX_FORCE_INLINE void load_b_values_avx2( + const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, + __m256i *round, const int16_t *quant_ptr, __m256i *quant, + const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr, + __m256i *shift, int log_scale) { + *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr)); + *zbin = _mm256_permute4x64_epi64(*zbin, 0x54); + if (log_scale > 0) { + const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1))); + *zbin = _mm256_add_epi16(*zbin, rnd); + *zbin = _mm256_srai_epi16(*zbin, log_scale); + } + // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when + // calculating the zbin mask. (See quantize_b_logscale{0,1,2}_16) + *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1)); + + *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); + *round = _mm256_permute4x64_epi64(*round, 0x54); + if (log_scale > 0) { + const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1))); + *round = _mm256_add_epi16(*round, rnd); + *round = _mm256_srai_epi16(*round, log_scale); + } + + *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); + *quant = _mm256_permute4x64_epi64(*quant, 0x54); + *dequant = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); + *dequant = _mm256_permute4x64_epi64(*dequant, 0x54); + *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr)); + *shift = _mm256_permute4x64_epi64(*shift, 0x54); +} + +static VPX_FORCE_INLINE __m256i +load_coefficients_avx2(const tran_low_t *coeff_ptr) { +#if CONFIG_VP9_HIGHBITDEPTH + // typedef int32_t tran_low_t; + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(coeff_ptr + 8)); + return _mm256_packs_epi32(coeff1, coeff2); +#else + // typedef int16_t tran_low_t; + return _mm256_loadu_si256((const __m256i *)coeff_ptr); +#endif +} + +static VPX_FORCE_INLINE void store_coefficients_avx2(__m256i coeff_vals, + tran_low_t *coeff_ptr) { +#if CONFIG_VP9_HIGHBITDEPTH + // typedef int32_t tran_low_t; + __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15); + __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign); + __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign); + _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals_lo); + _mm256_storeu_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi); +#else + // typedef int16_t tran_low_t; + _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals); +#endif +} + +static VPX_FORCE_INLINE __m256i +quantize_b_16(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, __m256i *v_quant, __m256i *v_dequant, + __m256i *v_round, __m256i *v_zbin, __m256i *v_quant_shift) { + const __m256i v_coeff = load_coefficients_avx2(coeff_ptr); + const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff); + const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin); + + if (_mm256_movemask_epi8(v_zbin_mask) == 0) { + _mm256_storeu_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256()); + _mm256_storeu_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256()); +#if CONFIG_VP9_HIGHBITDEPTH + 
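// With CONFIG_VP9_HIGHBITDEPTH, tran_low_t is int32_t, so the 16
// coefficients handled per pass span two 256-bit registers; the early-out
// path must therefore also clear the upper halves at element offset +8.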
_mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256()); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256()); +#endif // CONFIG_VP9_HIGHBITDEPTH + return _mm256_setzero_si256(); + } + { + // tmp = v_zbin_mask ? (int64_t)abs_coeff + log_scaled_round : 0 + const __m256i v_tmp_rnd = + _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask); + + const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant); + const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd); + const __m256i v_tmp32 = _mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift); + const __m256i v_nz_mask = + _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256()); + const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff); +#if CONFIG_VP9_HIGHBITDEPTH + const __m256i low = _mm256_mullo_epi16(v_qcoeff, *v_dequant); + const __m256i high = _mm256_mulhi_epi16(v_qcoeff, *v_dequant); + + const __m256i v_dqcoeff_lo = _mm256_unpacklo_epi16(low, high); + const __m256i v_dqcoeff_hi = _mm256_unpackhi_epi16(low, high); +#else + const __m256i v_dqcoeff = _mm256_mullo_epi16(v_qcoeff, *v_dequant); +#endif + + store_coefficients_avx2(v_qcoeff, qcoeff_ptr); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo); + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi); +#else + store_coefficients_avx2(v_dqcoeff, dqcoeff_ptr); +#endif + return v_nz_mask; + } +} + +static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan, + __m256i v_eobmax, + __m256i v_mask) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m256i v_iscan = _mm256_permute4x64_epi64( + _mm256_loadu_si256((const __m256i *)iscan), 0xD8); +#else + const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); +#endif + const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask); + return _mm256_max_epi16(v_eobmax, v_nz_iscan); +} + +static VPX_FORCE_INLINE int16_t accumulate_eob256(__m256i eob256) { + const __m128i eob_lo = _mm256_castsi256_si128(eob256); + const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1); + __m128i eob = _mm_max_epi16(eob_lo, eob_hi); + __m128i eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + return _mm_extract_epi16(eob, 1); +} + +void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift, v_nz_mask; + __m256i v_eobmax = _mm256_setzero_si256(); + intptr_t count; + (void)scan; + + load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, + &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, + &v_quant_shift, 0); + // Do DC and first 15 AC. 
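// Only lane 0 of zbin/round/quant/shift holds the DC constants at this
// point (permute4x64 with 0x54 above leaves DC in lane 0 and AC in lanes
// 1-15); after this first pass the vectors are unpacked (unpackhi_epi64
// below) so every lane carries the AC (rc != 0) constants for the
// remaining 16-coefficient passes.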
+ v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, + &v_dequant, &v_round, &v_zbin, &v_quant_shift); + + v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask); + + v_round = _mm256_unpackhi_epi64(v_round, v_round); + v_quant = _mm256_unpackhi_epi64(v_quant, v_quant); + v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant); + v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift); + v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin); + + for (count = n_coeffs - 16; count > 0; count -= 16) { + coeff_ptr += 16; + qcoeff_ptr += 16; + dqcoeff_ptr += 16; + iscan += 16; + v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, + &v_dequant, &v_round, &v_zbin, &v_quant_shift); + + v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask); + } + + *eob_ptr = accumulate_eob256(v_eobmax); +} + +static VPX_FORCE_INLINE __m256i quantize_b_32x32_16( + const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *iscan, __m256i *v_quant, + __m256i *v_dequant, __m256i *v_round, __m256i *v_zbin, + __m256i *v_quant_shift, __m256i *v_eobmax) { + const __m256i v_coeff = load_coefficients_avx2(coeff_ptr); + const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff); + const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin); + + if (_mm256_movemask_epi8(v_zbin_mask) == 0) { + _mm256_store_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256()); + _mm256_store_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256()); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256()); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256()); +#endif + return *v_eobmax; + } + { + // tmp = v_zbin_mask ? (int64_t)abs_coeff + round : 0 + const __m256i v_tmp_rnd = + _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask); + // tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + // quant_shift_ptr[rc != 0]) >> 15); + const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant); + const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd); + const __m256i v_tmp32_hi = + _mm256_slli_epi16(_mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift), 1); + const __m256i v_tmp32_lo = + _mm256_srli_epi16(_mm256_mullo_epi16(v_tmp32_b, *v_quant_shift), 15); + const __m256i v_tmp32 = _mm256_or_si256(v_tmp32_hi, v_tmp32_lo); + const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff); + const __m256i v_sign_lo = + _mm256_unpacklo_epi16(_mm256_setzero_si256(), v_coeff); + const __m256i v_sign_hi = + _mm256_unpackhi_epi16(_mm256_setzero_si256(), v_coeff); + const __m256i low = _mm256_mullo_epi16(v_tmp32, *v_dequant); + const __m256i high = _mm256_mulhi_epi16(v_tmp32, *v_dequant); + const __m256i v_dqcoeff_lo = _mm256_sign_epi32( + _mm256_srli_epi32(_mm256_unpacklo_epi16(low, high), 1), v_sign_lo); + const __m256i v_dqcoeff_hi = _mm256_sign_epi32( + _mm256_srli_epi32(_mm256_unpackhi_epi16(low, high), 1), v_sign_hi); + const __m256i v_nz_mask = + _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256()); + + store_coefficients_avx2(v_qcoeff, qcoeff_ptr); + +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo); + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi); +#else + store_coefficients_avx2(_mm256_packs_epi32(v_dqcoeff_lo, v_dqcoeff_hi), + dqcoeff_ptr); +#endif + + return get_max_lane_eob(iscan, *v_eobmax, v_nz_mask); + } +} + +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t 
*zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; + __m256i v_eobmax = _mm256_setzero_si256(); + intptr_t count; + (void)n_coeffs; + (void)scan; + + load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, + &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, + &v_quant_shift, 1); + + // Do DC and first 15 AC. + v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan, + &v_quant, &v_dequant, &v_round, &v_zbin, + &v_quant_shift, &v_eobmax); + + v_round = _mm256_unpackhi_epi64(v_round, v_round); + v_quant = _mm256_unpackhi_epi64(v_quant, v_quant); + v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant); + v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift); + v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin); + + for (count = (32 * 32) - 16; count > 0; count -= 16) { + coeff_ptr += 16; + qcoeff_ptr += 16; + dqcoeff_ptr += 16; + iscan += 16; + v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan, + &v_quant, &v_dequant, &v_round, &v_zbin, + &v_quant_shift, &v_eobmax); + } + + *eob_ptr = accumulate_eob256(v_eobmax); +} diff --git a/libvpx/vpx_dsp/x86/quantize_sse2.c b/libvpx/vpx_dsp/x86/quantize_sse2.c index 459d95f28..9533e7916 100644 --- a/libvpx/vpx_dsp/x86/quantize_sse2.c +++ b/libvpx/vpx_dsp/x86/quantize_sse2.c @@ -76,7 +76,7 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); // AC only loop. while (index < n_coeffs) { @@ -106,8 +106,7 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); index += 16; diff --git a/libvpx/vpx_dsp/x86/quantize_sse2.h b/libvpx/vpx_dsp/x86/quantize_sse2.h index afe2f924b..27bfb4e41 100644 --- a/libvpx/vpx_dsp/x86/quantize_sse2.h +++ b/libvpx/vpx_dsp/x86/quantize_sse2.h @@ -29,6 +29,15 @@ static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, *shift = _mm_load_si128((const __m128i *)shift_ptr); } +static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round, + const int16_t *quant_ptr, __m128i *quant, + const int16_t *dequant_ptr, + __m128i *dequant) { + *round = _mm_load_si128((const __m128i *)round_ptr); + *quant = _mm_load_si128((const __m128i *)quant_ptr); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); +} + // With ssse3 and later abs() and sign() are preferred. static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) { a = _mm_xor_si128(a, sign); @@ -62,11 +71,8 @@ static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant, #endif // CONFIG_VP9_HIGHBITDEPTH } -// Scan 16 values for eob reference in scan. Use masks (-1) from comparing to -// zbin to add 1 to the index in 'scan'. +// Scan 16 values for eob reference in scan. 
static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, - const __m128i zbin_mask0, - const __m128i zbin_mask1, const int16_t *scan, const int index, const __m128i zero) { const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero); @@ -74,9 +80,6 @@ static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, __m128i scan0 = _mm_load_si128((const __m128i *)(scan + index)); __m128i scan1 = _mm_load_si128((const __m128i *)(scan + index + 8)); __m128i eob0, eob1; - // Add one to convert from indices to counts - scan0 = _mm_sub_epi16(scan0, zbin_mask0); - scan1 = _mm_sub_epi16(scan1, zbin_mask1); eob0 = _mm_andnot_si128(zero_coeff0, scan0); eob1 = _mm_andnot_si128(zero_coeff1, scan1); return _mm_max_epi16(eob0, eob1); diff --git a/libvpx/vpx_dsp/x86/quantize_ssse3.c b/libvpx/vpx_dsp/x86/quantize_ssse3.c index 9d2a88b7b..476230286 100644 --- a/libvpx/vpx_dsp/x86/quantize_ssse3.c +++ b/libvpx/vpx_dsp/x86/quantize_ssse3.c @@ -70,7 +70,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); // AC only loop. while (index < n_coeffs) { @@ -98,8 +98,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); index += 16; @@ -202,8 +201,7 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8); - eob = - scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); } // AC only loop. 
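These scan_for_eob hunks are one mechanical change repeated across the SSE2/SSSE3/AVX quantizers: the zbin comparison masks are dropped from the helper because their only job was to add 1 to each scan index (subtracting a mask of all ones), turning the running maximum into a coefficient count. Together with the highbd SSE2 hunks earlier in this diff, where the eob accumulator now starts at 0 and is stored without the trailing + 1, this is consistent with the iscan tables themselves now carrying the + 1 (a change outside the files shown here). A scalar sketch of the two conventions; eob_old/eob_new and the iscan0/iscan1 arrays are illustrative names, not library API:

#include <stdint.h>

/* Old convention: iscan0[] holds 0-based scan positions, so the stored
 * end-of-block value needs a final +1 to become a coefficient count. */
static int eob_old(const int16_t *iscan0, const int32_t *qcoeff, int n) {
  int i, eob = -1;
  for (i = 0; i < n; ++i)
    if (qcoeff[i] && iscan0[i] > eob) eob = iscan0[i];
  return eob + 1;
}

/* New convention: iscan1[] already holds position + 1, so the running
 * maximum is the coefficient count directly and the vector helper no
 * longer needs the zbin masks to patch up the indices. */
static int eob_new(const int16_t *iscan1, const int32_t *qcoeff, int n) {
  int i, eob = 0;
  for (i = 0; i < n; ++i)
    if (qcoeff[i] && iscan1[i] > eob) eob = iscan1[i];
  return eob;
}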
@@ -249,8 +247,7 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8 + index); - eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); } diff --git a/libvpx/vpx_dsp/x86/sad_avx2.c b/libvpx/vpx_dsp/x86/sad_avx2.c index 3b48acd51..29bedb0e6 100644 --- a/libvpx/vpx_dsp/x86/sad_avx2.c +++ b/libvpx/vpx_dsp/x86/sad_avx2.c @@ -14,7 +14,7 @@ #define FSAD64_H(h) \ unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ - int i, res; \ + int i; \ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ __m256i sum_sad = _mm256_setzero_si256(); \ __m256i sum_sad_h; \ @@ -35,8 +35,7 @@ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ + return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ } #define FSAD32_H(h) \ @@ -92,7 +91,7 @@ FSAD32 unsigned int vpx_sad64x##h##_avg_avx2( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, const uint8_t *second_pred) { \ - int i, res; \ + int i; \ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ __m256i sum_sad = _mm256_setzero_si256(); \ __m256i sum_sad_h; \ @@ -118,15 +117,14 @@ FSAD32 sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ + return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ } #define FSADAVG32_H(h) \ unsigned int vpx_sad32x##h##_avg_avx2( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, const uint8_t *second_pred) { \ - int i, res; \ + int i; \ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ __m256i sum_sad = _mm256_setzero_si256(); \ __m256i sum_sad_h; \ @@ -156,8 +154,7 @@ FSAD32 sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ + return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ } #define FSADAVG64 \ diff --git a/libvpx/vpx_dsp/x86/subtract_avx2.c b/libvpx/vpx_dsp/x86/subtract_avx2.c new file mode 100644 index 000000000..4849581ed --- /dev/null +++ b/libvpx/vpx_dsp/x86/subtract_avx2.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <immintrin.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +static VPX_FORCE_INLINE void subtract32_avx2(int16_t *diff_ptr, + const uint8_t *src_ptr, + const uint8_t *pred_ptr) { + const __m256i s = _mm256_lddqu_si256((const __m256i *)src_ptr); + const __m256i p = _mm256_lddqu_si256((const __m256i *)pred_ptr); + const __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s)); + const __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1)); + const __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p)); + const __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1)); + const __m256i d_0 = _mm256_sub_epi16(s_0, p_0); + const __m256i d_1 = _mm256_sub_epi16(s_1, p_1); + _mm256_storeu_si256((__m256i *)diff_ptr, d_0); + _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d_1); +} + +static VPX_FORCE_INLINE void subtract_block_16xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + int j; + for (j = 0; j < rows; ++j) { + const __m128i s = _mm_lddqu_si128((const __m128i *)src_ptr); + const __m128i p = _mm_lddqu_si128((const __m128i *)pred_ptr); + const __m256i s_0 = _mm256_cvtepu8_epi16(s); + const __m256i p_0 = _mm256_cvtepu8_epi16(p); + const __m256i d_0 = _mm256_sub_epi16(s_0, p_0); + _mm256_storeu_si256((__m256i *)diff_ptr, d_0); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} + +static VPX_FORCE_INLINE void subtract_block_32xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + int j; + for (j = 0; j < rows; ++j) { + subtract32_avx2(diff_ptr, src_ptr, pred_ptr); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} + +static VPX_FORCE_INLINE void subtract_block_64xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + int j; + for (j = 0; j < rows; ++j) { + subtract32_avx2(diff_ptr, src_ptr, pred_ptr); + subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} + +void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride) { + switch (cols) { + case 16: + subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); + break; + case 32: + subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); + break; + case 64: + subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); + break; + default: + vpx_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, + const uint8_t *src8_ptr, + ptrdiff_t src_stride, + const uint8_t *pred8_ptr, + ptrdiff_t pred_stride, int bd) { + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(pred8_ptr); + (void)bd; + if (cols == 64) { + int j = rows; + do { + const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr); + 
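// Four 256-bit loads cover one 64-column row of 16-bit pixels.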
const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16)); + const __m256i s2 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 32)); + const __m256i s3 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 48)); + const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr); + const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16)); + const __m256i p2 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 32)); + const __m256i p3 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 48)); + const __m256i d0 = _mm256_sub_epi16(s0, p0); + const __m256i d1 = _mm256_sub_epi16(s1, p1); + const __m256i d2 = _mm256_sub_epi16(s2, p2); + const __m256i d3 = _mm256_sub_epi16(s3, p3); + _mm256_storeu_si256((__m256i *)diff_ptr, d0); + _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1); + _mm256_storeu_si256((__m256i *)(diff_ptr + 32), d2); + _mm256_storeu_si256((__m256i *)(diff_ptr + 48), d3); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } while (--j != 0); + } else if (cols == 32) { + int j = rows; + do { + const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr); + const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16)); + const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr); + const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16)); + const __m256i d0 = _mm256_sub_epi16(s0, p0); + const __m256i d1 = _mm256_sub_epi16(s1, p1); + _mm256_storeu_si256((__m256i *)diff_ptr, d0); + _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } while (--j != 0); + } else if (cols == 16) { + int j = rows; + do { + const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr); + const __m256i s1 = + _mm256_lddqu_si256((const __m256i *)(src_ptr + src_stride)); + const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr); + const __m256i p1 = + _mm256_lddqu_si256((const __m256i *)(pred_ptr + pred_stride)); + const __m256i d0 = _mm256_sub_epi16(s0, p0); + const __m256i d1 = _mm256_sub_epi16(s1, p1); + _mm256_storeu_si256((__m256i *)diff_ptr, d0); + _mm256_storeu_si256((__m256i *)(diff_ptr + diff_stride), d1); + src_ptr += src_stride << 1; + pred_ptr += pred_stride << 1; + diff_ptr += diff_stride << 1; + j -= 2; + } while (j != 0); + } else if (cols == 8) { + int j = rows; + do { + const __m128i s0 = _mm_lddqu_si128((const __m128i *)src_ptr); + const __m128i s1 = + _mm_lddqu_si128((const __m128i *)(src_ptr + src_stride)); + const __m128i p0 = _mm_lddqu_si128((const __m128i *)pred_ptr); + const __m128i p1 = + _mm_lddqu_si128((const __m128i *)(pred_ptr + pred_stride)); + const __m128i d0 = _mm_sub_epi16(s0, p0); + const __m128i d1 = _mm_sub_epi16(s1, p1); + _mm_storeu_si128((__m128i *)diff_ptr, d0); + _mm_storeu_si128((__m128i *)(diff_ptr + diff_stride), d1); + src_ptr += src_stride << 1; + pred_ptr += pred_stride << 1; + diff_ptr += diff_stride << 1; + j -= 2; + } while (j != 0); + } else { + int j = rows; + assert(cols == 4); + do { + const __m128i s0 = _mm_loadl_epi64((const __m128i *)src_ptr); + const __m128i s1 = + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride)); + const __m128i p0 = _mm_loadl_epi64((const __m128i *)pred_ptr); + const __m128i p1 = + _mm_loadl_epi64((const __m128i *)(pred_ptr + pred_stride)); + const __m128i d0 = _mm_sub_epi16(s0, p0); + const __m128i d1 = _mm_sub_epi16(s1, p1); + _mm_storel_epi64((__m128i *)diff_ptr, d0); + _mm_storel_epi64((__m128i *)(diff_ptr + diff_stride), d1); + src_ptr += 
src_stride << 1; + pred_ptr += pred_stride << 1; + diff_ptr += diff_stride << 1; + j -= 2; + } while (j != 0); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libvpx/vpx_dsp/x86/sum_squares_sse2.c b/libvpx/vpx_dsp/x86/sum_squares_sse2.c index 14f3b35c0..df6514b2c 100644 --- a/libvpx/vpx_dsp/x86/sum_squares_sse2.c +++ b/libvpx/vpx_dsp/x86/sum_squares_sse2.c @@ -33,7 +33,7 @@ uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) { } else { // Generic case int r = size; - const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); + const __m128i v_zext_mask_q = _mm_set_epi32(0, -1, 0, -1); __m128i v_acc_q = _mm_setzero_si128(); assert(size % 8 == 0); diff --git a/libvpx/vpx_dsp/x86/variance_avx2.c b/libvpx/vpx_dsp/x86/variance_avx2.c index 9232acbfb..35925d590 100644 --- a/libvpx/vpx_dsp/x86/variance_avx2.c +++ b/libvpx/vpx_dsp/x86/variance_avx2.c @@ -590,17 +590,20 @@ static INLINE int sub_pix_var32xh(const uint8_t *src, int src_stride, return sum; } -static unsigned int sub_pixel_variance32xh_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, int height, unsigned int *sse) { +static int sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + int height, unsigned int *sse) { return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride, NULL, 0, 0, height, sse); } -static unsigned int sub_pixel_avg_variance32xh_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, const uint8_t *second_pred, - int second_stride, int height, unsigned int *sse) { +static int sub_pixel_avg_variance32xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, + int second_stride, int height, + unsigned int *sse) { return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride, second_pred, second_stride, 1, height, sse); } diff --git a/libvpx/vpx_dsp/x86/variance_sse2.c b/libvpx/vpx_dsp/x86/variance_sse2.c index a67c92aad..d6eb12da1 100644 --- a/libvpx/vpx_dsp/x86/variance_sse2.c +++ b/libvpx/vpx_dsp/x86/variance_sse2.c @@ -19,7 +19,7 @@ static INLINE unsigned int add32x4_sse2(__m128i val) { val = _mm_add_epi32(val, _mm_srli_si128(val, 8)); val = _mm_add_epi32(val, _mm_srli_si128(val, 4)); - return _mm_cvtsi128_si32(val); + return (unsigned int)_mm_cvtsi128_si32(val); } unsigned int vpx_get_mb_ss_sse2(const int16_t *src_ptr) { @@ -85,7 +85,7 @@ static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum, vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); vsum = _mm_unpacklo_epi16(vsum, vsum); vsum = _mm_srai_epi32(vsum, 16); - *sum = add32x4_sse2(vsum); + *sum = (int)add32x4_sse2(vsum); } static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) { @@ -97,7 +97,7 @@ static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) { // Can handle 1024 pixels' diff sum (such as 32x32) static INLINE int sum_final_sse2(const __m128i sum) { const __m128i t = sum_to_32bit_sse2(sum); - return add32x4_sse2(t); + return (int)add32x4_sse2(t); } static INLINE void variance4_sse2(const uint8_t *src_ptr, const int src_stride, @@ -349,7 +349,7 @@ unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); } *sse = add32x4_sse2(vsse); - sum = add32x4_sse2(vsum); + sum = (int)add32x4_sse2(vsum); return 
*sse - (unsigned int)(((int64_t)sum * sum) >> 11); } @@ -369,7 +369,7 @@ unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); } *sse = add32x4_sse2(vsse); - sum = add32x4_sse2(vsum); + sum = (int)add32x4_sse2(vsum); return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); } @@ -389,7 +389,7 @@ unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); } *sse = add32x4_sse2(vsse); - sum = add32x4_sse2(vsum); + sum = (int)add32x4_sse2(vsum); return *sse - (unsigned int)(((int64_t)sum * sum) >> 12); } diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c index 0cbd151dc..21a35ae3c 100644 --- a/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c +++ b/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c @@ -485,7 +485,7 @@ static void vpx_filter_block1d4_h4_sse2(const uint8_t *src_ptr, // Saturate and convert to 8-bit words dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); - *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first); + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first); src_ptr += src_stride; dst_ptr += dst_stride; @@ -589,8 +589,8 @@ static void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr, res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, reg_zero); // Save only half of the register (8 words) - *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(res_reg_m1012); - *((uint32_t *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(res_reg_0123); + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(res_reg_m1012); + *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(res_reg_0123); // Update the source by two rows src_ptr += src_stride_unrolled; diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 6f2983a4b..c7d880860 100644 --- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -227,6 +227,9 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2( s2[2] = _mm256_unpackhi_epi8(s32b[4], s32b[5]); } + // The output_height is always a multiple of two. + assert(!(output_height & 1)); + for (i = output_height; i > 1; i -= 2) { __m256i srcRegHead2, srcRegHead3; @@ -282,35 +285,6 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2( s2[2] = s2[3]; srcRegHead1 = srcRegHead3; } - - // if the number of strides is odd. 
- // process only 16 bytes - if (i > 0) { - // load the last 16 bytes - const __m128i srcRegHead2 = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); - - // merge the last 2 results together - s1[0] = _mm256_castsi128_si256( - _mm_unpacklo_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2)); - s2[0] = _mm256_castsi128_si256( - _mm_unpackhi_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2)); - - outReg1 = convolve8_8_avx2(s1, f); - outReg2 = convolve8_8_avx2(s2, f); - - // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane - // contain the first and second convolve result respectively - outReg1 = _mm_packus_epi16(outReg1, outReg2); - - // average if necessary - if (avg) { - outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); - } - - // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, outReg1); - } } static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, @@ -798,7 +772,7 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, // Pack to 8-bits dst = _mm_packus_epi16(dst, _mm_setzero_si128()); - *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst); + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst); } } diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c index ed46d6245..4ea2752d3 100644 --- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c +++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c @@ -580,7 +580,7 @@ static void vpx_filter_block1d4_h4_ssse3(const uint8_t *src_ptr, // Pack to 8-bits dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); - *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first); + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first); src_ptr += src_stride; dst_ptr += dst_stride; @@ -666,8 +666,8 @@ static void vpx_filter_block1d4_v4_ssse3(const uint8_t *src_ptr, reg_1 = _mm_packus_epi16(reg_1, reg_1); // Save the result - *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(reg_0); - *((uint32_t *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(reg_1); + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(reg_0); + *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(reg_1); // Update the source by two rows src_ptr += src_stride_unrolled;
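Closing note on the new subtract_avx2.c introduced above: all of its width-specialized loops compute the same residual, source minus prediction widened to 16 bits, matching the semantics of the generic C path (vpx_subtract_block_c). A minimal scalar sketch of that operation, for orientation only:

  #include <stddef.h>
  #include <stdint.h>

  /* Hypothetical scalar model, not part of libvpx: residual = src - pred,
   * with 8-bit inputs widened to int16_t. */
  static void subtract_block_model(int rows, int cols, int16_t *diff,
                                   ptrdiff_t diff_stride, const uint8_t *src,
                                   ptrdiff_t src_stride, const uint8_t *pred,
                                   ptrdiff_t pred_stride) {
    int r, c;
    for (r = 0; r < rows; ++r) {
      for (c = 0; c < cols; ++c) {
        diff[c] = (int16_t)(src[c] - pred[c]);
      }
      diff += diff_stride;
      src += src_stride;
      pred += pred_stride;
    }
  }

The AVX2 kernels perform the same computation with _mm256_cvtepu8_epi16 widening followed by _mm256_sub_epi16, and the dispatcher falls back to the SSE2 version for block widths other than 16, 32, and 64.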