author      Android Build Coastguard Worker <android-build-coastguard-worker@google.com>  2023-04-04 22:17:52 +0000
committer   Android Build Coastguard Worker <android-build-coastguard-worker@google.com>  2023-04-04 22:17:52 +0000
commit      ad0eb6a76e32cae294819880ee07105c158982a1 (patch)
tree        1dd29cedbba6b704f1dc6447d964096541a500f7 /libvpx/vpx_dsp
parent      79e287e28ce7149c4d059225c8f3ba9b3304c972 (diff)
parent      c1d09c0300c54c9f0c78efb6d82a83f1fcd8af56 (diff)
Snap for 9883729 from c1d09c0300c54c9f0c78efb6d82a83f1fcd8af56 to mainline-sdkext-release
Tags: aml_sdk_331812000, aml_sdk_331811000, android13-mainline-sdkext-release
Change-Id: Ifb36af904bfbcd93f375eee1b6c54a2d25579953
Diffstat (limited to 'libvpx/vpx_dsp')
-rw-r--r--  libvpx/vpx_dsp/arm/fdct16x16_neon.c | 81
-rw-r--r--  libvpx/vpx_dsp/arm/fdct16x16_neon.h | 587
-rw-r--r--  libvpx/vpx_dsp/arm/fdct32x32_neon.c | 1574
-rw-r--r--  libvpx/vpx_dsp/arm/fdct32x32_neon.h | 2919
-rw-r--r--  libvpx/vpx_dsp/arm/fdct4x4_neon.c (renamed from libvpx/vpx_dsp/arm/fdct_neon.c) | 43
-rw-r--r--  libvpx/vpx_dsp/arm/fdct4x4_neon.h | 105
-rw-r--r--  libvpx/vpx_dsp/arm/fdct8x8_neon.c | 143
-rw-r--r--  libvpx/vpx_dsp/arm/fdct8x8_neon.h | 381
-rw-r--r--  libvpx/vpx_dsp/arm/fdct_neon.h | 602
-rw-r--r--  libvpx/vpx_dsp/arm/fdct_partial_neon.c | 65
-rw-r--r--  libvpx/vpx_dsp/arm/fwd_txfm_neon.c | 68
-rw-r--r--  libvpx/vpx_dsp/arm/hadamard_neon.c | 42
-rw-r--r--  libvpx/vpx_dsp/arm/highbd_quantize_neon.c | 307
-rw-r--r--  libvpx/vpx_dsp/arm/highbd_sad_neon.c | 225
-rw-r--r--  libvpx/vpx_dsp/arm/highbd_variance_neon.c | 496
-rw-r--r--  libvpx/vpx_dsp/arm/mem_neon.h | 165
-rw-r--r--  libvpx/vpx_dsp/arm/quantize_neon.c | 287
-rw-r--r--  libvpx/vpx_dsp/arm/sad4d_neon.c | 25
-rw-r--r--  libvpx/vpx_dsp/arm/sad_neon.c | 298
-rw-r--r--  libvpx/vpx_dsp/arm/subpel_variance_neon.c | 608
-rw-r--r--  libvpx/vpx_dsp/arm/subtract_neon.c | 56
-rw-r--r--  libvpx/vpx_dsp/arm/transpose_neon.h | 79
-rw-r--r--  libvpx/vpx_dsp/arm/variance_neon.c | 548
-rw-r--r--  libvpx/vpx_dsp/arm/vpx_convolve8_neon.c | 1101
-rw-r--r--  libvpx/vpx_dsp/arm/vpx_convolve8_neon.h | 206
-rw-r--r--  libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c | 10
-rw-r--r--  libvpx/vpx_dsp/avg.c | 3
-rw-r--r--  libvpx/vpx_dsp/bitwriter.h | 5
-rw-r--r--  libvpx/vpx_dsp/loongarch/quantize_lsx.c | 13
-rw-r--r--  libvpx/vpx_dsp/loopfilter.c | 12
-rw-r--r--  libvpx/vpx_dsp/mips/macros_msa.h | 46
-rw-r--r--  libvpx/vpx_dsp/ppc/quantize_vsx.c | 24
-rw-r--r--  libvpx/vpx_dsp/psnr.c | 67
-rw-r--r--  libvpx/vpx_dsp/variance.c | 6
-rw-r--r--  libvpx/vpx_dsp/vpx_dsp.mk | 14
-rw-r--r--  libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl | 347
-rw-r--r--  libvpx/vpx_dsp/x86/avg_intrin_avx2.c | 4
-rw-r--r--  libvpx/vpx_dsp/x86/avg_intrin_sse2.c | 4
-rw-r--r--  libvpx/vpx_dsp/x86/convolve_avx2.h | 5
-rw-r--r--  libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h | 2
-rw-r--r--  libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h | 2
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h | 2
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c | 8
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 258
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 13
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c | 401
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_sad_avx2.c | 468
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_variance_sse2.c | 47
-rw-r--r--  libvpx/vpx_dsp/x86/inv_txfm_sse2.c | 4
-rw-r--r--  libvpx/vpx_dsp/x86/loopfilter_avx2.c | 4
-rw-r--r--  libvpx/vpx_dsp/x86/loopfilter_sse2.c | 14
-rw-r--r--  libvpx/vpx_dsp/x86/mem_sse2.h | 4
-rw-r--r--  libvpx/vpx_dsp/x86/post_proc_sse2.c | 2
-rw-r--r--  libvpx/vpx_dsp/x86/quantize_avx.c | 12
-rw-r--r--  libvpx/vpx_dsp/x86/quantize_avx2.c | 293
-rw-r--r--  libvpx/vpx_dsp/x86/quantize_sse2.c | 5
-rw-r--r--  libvpx/vpx_dsp/x86/quantize_sse2.h | 17
-rw-r--r--  libvpx/vpx_dsp/x86/quantize_ssse3.c | 11
-rw-r--r--  libvpx/vpx_dsp/x86/sad_avx2.c | 15
-rw-r--r--  libvpx/vpx_dsp/x86/subtract_avx2.c | 203
-rw-r--r--  libvpx/vpx_dsp/x86/sum_squares_sse2.c | 2
-rw-r--r--  libvpx/vpx_dsp/x86/variance_avx2.c | 17
-rw-r--r--  libvpx/vpx_dsp/x86/variance_sse2.c | 12
-rw-r--r--  libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c | 6
-rw-r--r--  libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 34
-rw-r--r--  libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c | 6
66 files changed, 10270 insertions, 3163 deletions
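
Note: the rewritten 16x16 and 32x32 transforms below express everything in terms of two butterfly primitives whose semantics are documented in the code comments: fdct_round_shift((a +/- b) * c) for the one-coefficient form and fdct_round_shift(a * c0 +/- b * c1) for the two-coefficient form. As a reading aid, here is a minimal scalar sketch of those operations. The *_ref names are illustrative only, DCT_CONST_BITS is 14 in libvpx, and the argument order shown for the two-coefficient form is the one implied by the updated call-site comments in this patch, not a copy of the shared helpers in fdct_neon.h.

#include <stdint.h>

#define DCT_CONST_BITS 14

/* Round and shift down by DCT_CONST_BITS, as the vqrshrn_n_s32(x, 14) /
 * vrshrq_n_s32(x, DCT_CONST_BITS) steps do in the vector code. */
int32_t fdct_round_shift_ref(int64_t x) {
  return (int32_t)((x + ((int64_t)1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}

/* One-coefficient butterfly:
 *   add = fdct_round_shift((a + b) * c)
 *   sub = fdct_round_shift((a - b) * c) */
void butterfly_one_coeff_ref(int32_t a, int32_t b, int32_t c,
                             int32_t *add, int32_t *sub) {
  *add = fdct_round_shift_ref((int64_t)(a + b) * c);
  *sub = fdct_round_shift_ref((int64_t)(a - b) * c);
}

/* Two-coefficient butterfly, with the coefficient order used by the
 * updated call sites in this patch:
 *   add = fdct_round_shift(a * c0 + b * c1)
 *   sub = fdct_round_shift(a * c1 - b * c0) */
void butterfly_two_coeff_ref(int32_t a, int32_t b, int32_t c0, int32_t c1,
                             int32_t *add, int32_t *sub) {
  *add = fdct_round_shift_ref((int64_t)a * c0 + (int64_t)b * c1);
  *sub = fdct_round_shift_ref((int64_t)a * c1 - (int64_t)b * c0);
}

For example, the new call butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[4], &out[12]) then reads directly as out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64) and out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64), matching the comments in the diff.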
diff --git a/libvpx/vpx_dsp/arm/fdct16x16_neon.c b/libvpx/vpx_dsp/arm/fdct16x16_neon.c
index 67f43246a..a458ecaa4 100644
--- a/libvpx/vpx_dsp/arm/fdct16x16_neon.c
+++ b/libvpx/vpx_dsp/arm/fdct16x16_neon.c
@@ -35,22 +35,23 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
int16x8_t temp3[16];
// Left half.
- load(input, stride, temp0);
- cross_input(temp0, temp1, 0);
- vpx_fdct16x16_body(temp1, temp0);
+ load_cross(input, stride, temp0);
+ scale_input(temp0, temp1);
+ vpx_fdct8x16_body(temp1, temp0);
// Right half.
- load(input + 8, stride, temp1);
- cross_input(temp1, temp2, 0);
- vpx_fdct16x16_body(temp2, temp1);
+ load_cross(input + 8, stride, temp1);
+ scale_input(temp1, temp2);
+ vpx_fdct8x16_body(temp2, temp1);
// Transpose top left and top right quarters into one contiguous location to
// process to the top half.
- transpose_8x8(&temp0[0], &temp2[0]);
- transpose_8x8(&temp1[0], &temp2[8]);
+
+ transpose_s16_8x8_new(&temp0[0], &temp2[0]);
+ transpose_s16_8x8_new(&temp1[0], &temp2[8]);
partial_round_shift(temp2);
- cross_input(temp2, temp3, 1);
- vpx_fdct16x16_body(temp3, temp2);
+ cross_input(temp2, temp3);
+ vpx_fdct8x16_body(temp3, temp2);
transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4],
&temp2[5], &temp2[6], &temp2[7]);
transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12],
@@ -61,12 +62,13 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
// Transpose bottom left and bottom right quarters into one contiguous
// location to process to the bottom half.
- transpose_8x8(&temp0[8], &temp1[0]);
+ transpose_s16_8x8_new(&temp0[8], &temp1[0]);
+
transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
&temp1[13], &temp1[14], &temp1[15]);
partial_round_shift(temp1);
- cross_input(temp1, temp0, 1);
- vpx_fdct16x16_body(temp0, temp1);
+ cross_input(temp1, temp0);
+ vpx_fdct8x16_body(temp0, temp1);
transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4],
&temp1[5], &temp1[6], &temp1[7]);
transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
@@ -74,5 +76,58 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
store(output, temp1);
store(output + 8, temp1 + 8);
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int16x8_t temp0[16];
+ int32x4_t left1[16], left2[16], left3[16], left4[16], right1[16], right2[16],
+ right3[16], right4[16];
+
+ // Left half.
+ load_cross(input, stride, temp0);
+ highbd_scale_input(temp0, left1, right1);
+ vpx_highbd_fdct8x16_body(left1, right1);
+
+ // Right half.
+ load_cross(input + 8, stride, temp0);
+ highbd_scale_input(temp0, left2, right2);
+ vpx_highbd_fdct8x16_body(left2, right2);
+
+ // Transpose top left and top right quarters into one contiguous location to
+ // process to the top half.
+
+ transpose_s32_8x8_2(left1, right1, left3, right3);
+ transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8);
+ transpose_s32_8x8_2(left1 + 8, right1 + 8, left4, right4);
+ transpose_s32_8x8_2(left2 + 8, right2 + 8, left4 + 8, right4 + 8);
+
+ highbd_partial_round_shift(left3, right3);
+ highbd_cross_input(left3, right3, left1, right1);
+ vpx_highbd_fdct8x16_body(left1, right1);
+
+ // Transpose bottom left and bottom right quarters into one contiguous
+ // location to process to the bottom half.
+
+ highbd_partial_round_shift(left4, right4);
+ highbd_cross_input(left4, right4, left2, right2);
+ vpx_highbd_fdct8x16_body(left2, right2);
+
+ transpose_s32_8x8_2(left1, right1, left3, right3);
+ transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8);
+ transpose_s32_8x8_2(left1 + 8, right1 + 8, left4, right4);
+ transpose_s32_8x8_2(left2 + 8, right2 + 8, left4 + 8, right4 + 8);
+ store16_s32(output, left3);
+ output += 4;
+ store16_s32(output, right3);
+ output += 4;
+
+ store16_s32(output, left4);
+ output += 4;
+ store16_s32(output, right4);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
#endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) &&
// __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4
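
Note: the new vpx_highbd_fdct16x16_neon keeps each 8x16 half of the block as two int32x4_t arrays: left holds lanes 0-3 of every int16x8_t row and right holds lanes 4-7 (see highbd_scale_input), and store16_s32() writes one such array back with an output stride of 16 coefficients. A rough scalar picture of that store, assuming tran_low_t is 32-bit in the high-bitdepth build; the _ref name and the [16][4] layout are illustrative, not the library's types:

#include <stdint.h>
#include <string.h>

typedef int32_t tran_low_t;  /* 32-bit in CONFIG_VP9_HIGHBITDEPTH builds */

/* Scalar picture of store16_s32(): b[i] stands in for one int32x4_t,
 * i.e. four adjacent coefficients of row i; rows are 16 apart. */
void store16_s32_ref(tran_low_t *a, const int32_t b[16][4]) {
  int i;
  for (i = 0; i < 16; ++i) {
    memcpy(a + i * 16, b[i], sizeof(b[i]));
  }
}

vpx_highbd_fdct16x16_neon() calls this four times (left3, right3, left4, right4), advancing the output pointer by 4 coefficients between calls, so each call fills one 4-wide column band of the 16x16 output.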
diff --git a/libvpx/vpx_dsp/arm/fdct16x16_neon.h b/libvpx/vpx_dsp/arm/fdct16x16_neon.h
index 0dd21153f..43d820b6b 100644
--- a/libvpx/vpx_dsp/arm/fdct16x16_neon.h
+++ b/libvpx/vpx_dsp/arm/fdct16x16_neon.h
@@ -13,6 +13,8 @@
#include <arm_neon.h>
+#include "fdct_neon.h"
+
static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) {
b[0] = vld1q_s16(a);
a += stride;
@@ -72,45 +74,67 @@ static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) {
// To maybe reduce register usage this could be combined with the load() step to
// get the first 4 and last 4 values, cross those, then load the middle 8 values
// and cross them.
+static INLINE void scale_input(const int16x8_t *a /*[16]*/,
+ int16x8_t *b /*[16]*/) {
+ b[0] = vshlq_n_s16(a[0], 2);
+ b[1] = vshlq_n_s16(a[1], 2);
+ b[2] = vshlq_n_s16(a[2], 2);
+ b[3] = vshlq_n_s16(a[3], 2);
+ b[4] = vshlq_n_s16(a[4], 2);
+ b[5] = vshlq_n_s16(a[5], 2);
+ b[6] = vshlq_n_s16(a[6], 2);
+ b[7] = vshlq_n_s16(a[7], 2);
+
+ b[8] = vshlq_n_s16(a[8], 2);
+ b[9] = vshlq_n_s16(a[9], 2);
+ b[10] = vshlq_n_s16(a[10], 2);
+ b[11] = vshlq_n_s16(a[11], 2);
+ b[12] = vshlq_n_s16(a[12], 2);
+ b[13] = vshlq_n_s16(a[13], 2);
+ b[14] = vshlq_n_s16(a[14], 2);
+ b[15] = vshlq_n_s16(a[15], 2);
+}
+
static INLINE void cross_input(const int16x8_t *a /*[16]*/,
- int16x8_t *b /*[16]*/, const int pass) {
- if (pass == 0) {
- b[0] = vshlq_n_s16(vaddq_s16(a[0], a[15]), 2);
- b[1] = vshlq_n_s16(vaddq_s16(a[1], a[14]), 2);
- b[2] = vshlq_n_s16(vaddq_s16(a[2], a[13]), 2);
- b[3] = vshlq_n_s16(vaddq_s16(a[3], a[12]), 2);
- b[4] = vshlq_n_s16(vaddq_s16(a[4], a[11]), 2);
- b[5] = vshlq_n_s16(vaddq_s16(a[5], a[10]), 2);
- b[6] = vshlq_n_s16(vaddq_s16(a[6], a[9]), 2);
- b[7] = vshlq_n_s16(vaddq_s16(a[7], a[8]), 2);
-
- b[8] = vshlq_n_s16(vsubq_s16(a[7], a[8]), 2);
- b[9] = vshlq_n_s16(vsubq_s16(a[6], a[9]), 2);
- b[10] = vshlq_n_s16(vsubq_s16(a[5], a[10]), 2);
- b[11] = vshlq_n_s16(vsubq_s16(a[4], a[11]), 2);
- b[12] = vshlq_n_s16(vsubq_s16(a[3], a[12]), 2);
- b[13] = vshlq_n_s16(vsubq_s16(a[2], a[13]), 2);
- b[14] = vshlq_n_s16(vsubq_s16(a[1], a[14]), 2);
- b[15] = vshlq_n_s16(vsubq_s16(a[0], a[15]), 2);
- } else {
- b[0] = vaddq_s16(a[0], a[15]);
- b[1] = vaddq_s16(a[1], a[14]);
- b[2] = vaddq_s16(a[2], a[13]);
- b[3] = vaddq_s16(a[3], a[12]);
- b[4] = vaddq_s16(a[4], a[11]);
- b[5] = vaddq_s16(a[5], a[10]);
- b[6] = vaddq_s16(a[6], a[9]);
- b[7] = vaddq_s16(a[7], a[8]);
-
- b[8] = vsubq_s16(a[7], a[8]);
- b[9] = vsubq_s16(a[6], a[9]);
- b[10] = vsubq_s16(a[5], a[10]);
- b[11] = vsubq_s16(a[4], a[11]);
- b[12] = vsubq_s16(a[3], a[12]);
- b[13] = vsubq_s16(a[2], a[13]);
- b[14] = vsubq_s16(a[1], a[14]);
- b[15] = vsubq_s16(a[0], a[15]);
- }
+ int16x8_t *b /*[16]*/) {
+ b[0] = vaddq_s16(a[0], a[15]);
+ b[1] = vaddq_s16(a[1], a[14]);
+ b[2] = vaddq_s16(a[2], a[13]);
+ b[3] = vaddq_s16(a[3], a[12]);
+ b[4] = vaddq_s16(a[4], a[11]);
+ b[5] = vaddq_s16(a[5], a[10]);
+ b[6] = vaddq_s16(a[6], a[9]);
+ b[7] = vaddq_s16(a[7], a[8]);
+
+ b[8] = vsubq_s16(a[7], a[8]);
+ b[9] = vsubq_s16(a[6], a[9]);
+ b[10] = vsubq_s16(a[5], a[10]);
+ b[11] = vsubq_s16(a[4], a[11]);
+ b[12] = vsubq_s16(a[3], a[12]);
+ b[13] = vsubq_s16(a[2], a[13]);
+ b[14] = vsubq_s16(a[1], a[14]);
+ b[15] = vsubq_s16(a[0], a[15]);
+}
+
+static INLINE void load_cross(const int16_t *a, int stride,
+ int16x8_t *b /*[16]*/) {
+ b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride));
+ b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride));
+ b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride));
+ b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride));
+ b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride));
+ b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride));
+ b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride));
+ b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride));
+
+ b[8] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride));
+ b[9] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride));
+ b[10] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride));
+ b[11] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride));
+ b[12] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride));
+ b[13] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride));
+ b[14] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride));
+ b[15] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride));
}
// Quarter round at the beginning of the second pass. Can't use vrshr (rounding)
@@ -135,84 +159,9 @@ static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) {
a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2);
}
-// fdct_round_shift((a +/- b) * c)
-static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
- const tran_high_t c, int16x8_t *add,
- int16x8_t *sub) {
- const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c);
- const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c);
- const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), c);
- const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), c);
- const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c);
- const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c);
- const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14);
- const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14);
- const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14);
- const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14);
- *add = vcombine_s16(rounded0, rounded1);
- *sub = vcombine_s16(rounded2, rounded3);
-}
-
-// fdct_round_shift(a * c0 +/- b * c1)
-static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
- const tran_coef_t c0,
- const tran_coef_t c1, int16x8_t *add,
- int16x8_t *sub) {
- const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c0);
- const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0);
- const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1);
- const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1);
- const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), c0);
- const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), c0);
- const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c1);
- const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c1);
- const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14);
- const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14);
- const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14);
- const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14);
- *add = vcombine_s16(rounded0, rounded1);
- *sub = vcombine_s16(rounded2, rounded3);
-}
-
-// Transpose 8x8 to a new location. Don't use transpose_neon.h because those
-// are all in-place.
-static INLINE void transpose_8x8(const int16x8_t *a /*[8]*/,
- int16x8_t *b /*[8]*/) {
- // Swap 16 bit elements.
- const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
- const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
- const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
- const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
-
- // Swap 32 bit elements.
- const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
- vreinterpretq_s32_s16(c1.val[0]));
- const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
- vreinterpretq_s32_s16(c1.val[1]));
- const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
- vreinterpretq_s32_s16(c3.val[0]));
- const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
- vreinterpretq_s32_s16(c3.val[1]));
-
- // Swap 64 bit elements
- const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
- const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
- const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
- const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
-
- b[0] = e0.val[0];
- b[1] = e1.val[0];
- b[2] = e2.val[0];
- b[3] = e3.val[0];
- b[4] = e0.val[1];
- b[5] = e1.val[1];
- b[6] = e2.val[1];
- b[7] = e3.val[1];
-}
-
// Main body of fdct16x16.
-static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
- int16x8_t *out /*[16]*/) {
+static void vpx_fdct8x16_body(const int16x8_t *in /*[16]*/,
+ int16x8_t *out /*[16]*/) {
int16x8_t s[8];
int16x8_t x[4];
int16x8_t step[8];
@@ -237,16 +186,17 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
// out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
// out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
- butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]);
- // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
+ butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0],
+ &out[8]);
+ // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
// out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
- butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]);
+ butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[4], &out[12]);
// Stage 2
// Re-using source s5/s6
// s5 = fdct_round_shift((s6 - s5) * cospi_16_64)
// s6 = fdct_round_shift((s6 + s5) * cospi_16_64)
- butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]);
+ butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &s[6], &s[5]);
// Stage 3
x[0] = vaddq_s16(s[4], s[5]);
@@ -255,12 +205,12 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
x[3] = vaddq_s16(s[7], s[6]);
// Stage 4
- // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
- // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
- butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]);
- // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
- // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
- butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]);
+ // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64)
+ // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64)
+ butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[2], &out[14]);
+ // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64)
+ // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64)
+ butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[10], &out[6]);
// step 2
// From fwd_txfm.c: Work on the next eight values; step1 -> odd_results"
@@ -272,8 +222,8 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
// step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64)
// step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64)
// step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64)
- butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]);
- butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]);
+ butterfly_one_coeff_s16_fast(in[13], in[10], cospi_16_64, &s[5], &s[2]);
+ butterfly_one_coeff_s16_fast(in[12], in[11], cospi_16_64, &s[4], &s[3]);
// step 3
s[0] = vaddq_s16(in[8], s[3]);
@@ -286,13 +236,15 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
s[7] = vaddq_s16(in[15], s[4]);
// step 4
- // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64)
- // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64)
- butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]);
+ // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] *
+ // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1]
+ // * cospi_8_64)
+ butterfly_two_coeff(s[6], s[1], cospi_8_64, cospi_24_64, &s[6], &s[1]);
// step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64)
- // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64)
- butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]);
+ // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] *
+ // cospi_24_64)
+ butterfly_two_coeff(x[0], x[3], cospi_24_64, cospi_8_64, &s[2], &s[5]);
// step 5
step[0] = vaddq_s16(s[0], s[1]);
@@ -305,23 +257,368 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
step[7] = vaddq_s16(s[7], s[6]);
// step 6
- // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64)
- // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64)
- // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64)
- // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64)
- // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64)
- // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] *
- // cospi_22_64)
- // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64)
- // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64)
- butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9],
+ // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64)
+ // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64)
+ butterfly_two_coeff(step[6], step[1], cospi_18_64, cospi_14_64, &out[9],
&out[7]);
- butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1],
+ // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64)
+ // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64)
+ butterfly_two_coeff(step[7], step[0], cospi_2_64, cospi_30_64, &out[1],
&out[15]);
- butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13],
+
+ // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64)
+ // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64)
+ butterfly_two_coeff(step[4], step[3], cospi_26_64, cospi_6_64, &out[13],
&out[3]);
- butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5],
+
+ // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64)
+ // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64)
+ butterfly_two_coeff(step[5], step[2], cospi_10_64, cospi_22_64, &out[5],
&out[11]);
}
+#if CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE void highbd_scale_input(const int16x8_t *a /*[16]*/,
+ int32x4_t *left /*[16]*/,
+ int32x4_t *right /* [16] */) {
+ left[0] = vshll_n_s16(vget_low_s16(a[0]), 2);
+ left[1] = vshll_n_s16(vget_low_s16(a[1]), 2);
+ left[2] = vshll_n_s16(vget_low_s16(a[2]), 2);
+ left[3] = vshll_n_s16(vget_low_s16(a[3]), 2);
+ left[4] = vshll_n_s16(vget_low_s16(a[4]), 2);
+ left[5] = vshll_n_s16(vget_low_s16(a[5]), 2);
+ left[6] = vshll_n_s16(vget_low_s16(a[6]), 2);
+ left[7] = vshll_n_s16(vget_low_s16(a[7]), 2);
+ left[8] = vshll_n_s16(vget_low_s16(a[8]), 2);
+ left[9] = vshll_n_s16(vget_low_s16(a[9]), 2);
+ left[10] = vshll_n_s16(vget_low_s16(a[10]), 2);
+ left[11] = vshll_n_s16(vget_low_s16(a[11]), 2);
+ left[12] = vshll_n_s16(vget_low_s16(a[12]), 2);
+ left[13] = vshll_n_s16(vget_low_s16(a[13]), 2);
+ left[14] = vshll_n_s16(vget_low_s16(a[14]), 2);
+ left[15] = vshll_n_s16(vget_low_s16(a[15]), 2);
+
+ right[0] = vshll_n_s16(vget_high_s16(a[0]), 2);
+ right[1] = vshll_n_s16(vget_high_s16(a[1]), 2);
+ right[2] = vshll_n_s16(vget_high_s16(a[2]), 2);
+ right[3] = vshll_n_s16(vget_high_s16(a[3]), 2);
+ right[4] = vshll_n_s16(vget_high_s16(a[4]), 2);
+ right[5] = vshll_n_s16(vget_high_s16(a[5]), 2);
+ right[6] = vshll_n_s16(vget_high_s16(a[6]), 2);
+ right[7] = vshll_n_s16(vget_high_s16(a[7]), 2);
+ right[8] = vshll_n_s16(vget_high_s16(a[8]), 2);
+ right[9] = vshll_n_s16(vget_high_s16(a[9]), 2);
+ right[10] = vshll_n_s16(vget_high_s16(a[10]), 2);
+ right[11] = vshll_n_s16(vget_high_s16(a[11]), 2);
+ right[12] = vshll_n_s16(vget_high_s16(a[12]), 2);
+ right[13] = vshll_n_s16(vget_high_s16(a[13]), 2);
+ right[14] = vshll_n_s16(vget_high_s16(a[14]), 2);
+ right[15] = vshll_n_s16(vget_high_s16(a[15]), 2);
+}
+
+static INLINE void highbd_cross_input(const int32x4_t *a_left /*[16]*/,
+ int32x4_t *a_right /*[16]*/,
+ int32x4_t *b_left /*[16]*/,
+ int32x4_t *b_right /*[16]*/) {
+ b_left[0] = vaddq_s32(a_left[0], a_left[15]);
+ b_left[1] = vaddq_s32(a_left[1], a_left[14]);
+ b_left[2] = vaddq_s32(a_left[2], a_left[13]);
+ b_left[3] = vaddq_s32(a_left[3], a_left[12]);
+ b_left[4] = vaddq_s32(a_left[4], a_left[11]);
+ b_left[5] = vaddq_s32(a_left[5], a_left[10]);
+ b_left[6] = vaddq_s32(a_left[6], a_left[9]);
+ b_left[7] = vaddq_s32(a_left[7], a_left[8]);
+
+ b_right[0] = vaddq_s32(a_right[0], a_right[15]);
+ b_right[1] = vaddq_s32(a_right[1], a_right[14]);
+ b_right[2] = vaddq_s32(a_right[2], a_right[13]);
+ b_right[3] = vaddq_s32(a_right[3], a_right[12]);
+ b_right[4] = vaddq_s32(a_right[4], a_right[11]);
+ b_right[5] = vaddq_s32(a_right[5], a_right[10]);
+ b_right[6] = vaddq_s32(a_right[6], a_right[9]);
+ b_right[7] = vaddq_s32(a_right[7], a_right[8]);
+
+ b_left[8] = vsubq_s32(a_left[7], a_left[8]);
+ b_left[9] = vsubq_s32(a_left[6], a_left[9]);
+ b_left[10] = vsubq_s32(a_left[5], a_left[10]);
+ b_left[11] = vsubq_s32(a_left[4], a_left[11]);
+ b_left[12] = vsubq_s32(a_left[3], a_left[12]);
+ b_left[13] = vsubq_s32(a_left[2], a_left[13]);
+ b_left[14] = vsubq_s32(a_left[1], a_left[14]);
+ b_left[15] = vsubq_s32(a_left[0], a_left[15]);
+
+ b_right[8] = vsubq_s32(a_right[7], a_right[8]);
+ b_right[9] = vsubq_s32(a_right[6], a_right[9]);
+ b_right[10] = vsubq_s32(a_right[5], a_right[10]);
+ b_right[11] = vsubq_s32(a_right[4], a_right[11]);
+ b_right[12] = vsubq_s32(a_right[3], a_right[12]);
+ b_right[13] = vsubq_s32(a_right[2], a_right[13]);
+ b_right[14] = vsubq_s32(a_right[1], a_right[14]);
+ b_right[15] = vsubq_s32(a_right[0], a_right[15]);
+}
+
+static INLINE void highbd_partial_round_shift(int32x4_t *left /*[16]*/,
+ int32x4_t *right /* [16] */) {
+ const int32x4_t one = vdupq_n_s32(1);
+ left[0] = vshrq_n_s32(vaddq_s32(left[0], one), 2);
+ left[1] = vshrq_n_s32(vaddq_s32(left[1], one), 2);
+ left[2] = vshrq_n_s32(vaddq_s32(left[2], one), 2);
+ left[3] = vshrq_n_s32(vaddq_s32(left[3], one), 2);
+ left[4] = vshrq_n_s32(vaddq_s32(left[4], one), 2);
+ left[5] = vshrq_n_s32(vaddq_s32(left[5], one), 2);
+ left[6] = vshrq_n_s32(vaddq_s32(left[6], one), 2);
+ left[7] = vshrq_n_s32(vaddq_s32(left[7], one), 2);
+ left[8] = vshrq_n_s32(vaddq_s32(left[8], one), 2);
+ left[9] = vshrq_n_s32(vaddq_s32(left[9], one), 2);
+ left[10] = vshrq_n_s32(vaddq_s32(left[10], one), 2);
+ left[11] = vshrq_n_s32(vaddq_s32(left[11], one), 2);
+ left[12] = vshrq_n_s32(vaddq_s32(left[12], one), 2);
+ left[13] = vshrq_n_s32(vaddq_s32(left[13], one), 2);
+ left[14] = vshrq_n_s32(vaddq_s32(left[14], one), 2);
+ left[15] = vshrq_n_s32(vaddq_s32(left[15], one), 2);
+
+ right[0] = vshrq_n_s32(vaddq_s32(right[0], one), 2);
+ right[1] = vshrq_n_s32(vaddq_s32(right[1], one), 2);
+ right[2] = vshrq_n_s32(vaddq_s32(right[2], one), 2);
+ right[3] = vshrq_n_s32(vaddq_s32(right[3], one), 2);
+ right[4] = vshrq_n_s32(vaddq_s32(right[4], one), 2);
+ right[5] = vshrq_n_s32(vaddq_s32(right[5], one), 2);
+ right[6] = vshrq_n_s32(vaddq_s32(right[6], one), 2);
+ right[7] = vshrq_n_s32(vaddq_s32(right[7], one), 2);
+ right[8] = vshrq_n_s32(vaddq_s32(right[8], one), 2);
+ right[9] = vshrq_n_s32(vaddq_s32(right[9], one), 2);
+ right[10] = vshrq_n_s32(vaddq_s32(right[10], one), 2);
+ right[11] = vshrq_n_s32(vaddq_s32(right[11], one), 2);
+ right[12] = vshrq_n_s32(vaddq_s32(right[12], one), 2);
+ right[13] = vshrq_n_s32(vaddq_s32(right[13], one), 2);
+ right[14] = vshrq_n_s32(vaddq_s32(right[14], one), 2);
+ right[15] = vshrq_n_s32(vaddq_s32(right[15], one), 2);
+}
+
+// Store 16 32x4 vectors, assuming stride == 16.
+static INLINE void store16_s32(tran_low_t *a, const int32x4_t *b /*[32]*/) {
+ vst1q_s32(a, b[0]);
+ a += 16;
+ vst1q_s32(a, b[1]);
+ a += 16;
+ vst1q_s32(a, b[2]);
+ a += 16;
+ vst1q_s32(a, b[3]);
+ a += 16;
+ vst1q_s32(a, b[4]);
+ a += 16;
+ vst1q_s32(a, b[5]);
+ a += 16;
+ vst1q_s32(a, b[6]);
+ a += 16;
+ vst1q_s32(a, b[7]);
+ a += 16;
+ vst1q_s32(a, b[8]);
+ a += 16;
+ vst1q_s32(a, b[9]);
+ a += 16;
+ vst1q_s32(a, b[10]);
+ a += 16;
+ vst1q_s32(a, b[11]);
+ a += 16;
+ vst1q_s32(a, b[12]);
+ a += 16;
+ vst1q_s32(a, b[13]);
+ a += 16;
+ vst1q_s32(a, b[14]);
+ a += 16;
+ vst1q_s32(a, b[15]);
+}
+
+// Main body of fdct8x16 column
+static void vpx_highbd_fdct8x16_body(int32x4_t *left /*[16]*/,
+ int32x4_t *right /* [16] */) {
+ int32x4_t sl[8];
+ int32x4_t sr[8];
+ int32x4_t xl[4];
+ int32x4_t xr[4];
+ int32x4_t inl[8];
+ int32x4_t inr[8];
+ int32x4_t stepl[8];
+ int32x4_t stepr[8];
+
+ // stage 1
+ // From fwd_txfm.c: Work on the first eight values; fdct8(input,
+ // even_results);"
+ sl[0] = vaddq_s32(left[0], left[7]);
+ sr[0] = vaddq_s32(right[0], right[7]);
+ sl[1] = vaddq_s32(left[1], left[6]);
+ sr[1] = vaddq_s32(right[1], right[6]);
+ sl[2] = vaddq_s32(left[2], left[5]);
+ sr[2] = vaddq_s32(right[2], right[5]);
+ sl[3] = vaddq_s32(left[3], left[4]);
+ sr[3] = vaddq_s32(right[3], right[4]);
+ sl[4] = vsubq_s32(left[3], left[4]);
+ sr[4] = vsubq_s32(right[3], right[4]);
+ sl[5] = vsubq_s32(left[2], left[5]);
+ sr[5] = vsubq_s32(right[2], right[5]);
+ sl[6] = vsubq_s32(left[1], left[6]);
+ sr[6] = vsubq_s32(right[1], right[6]);
+ sl[7] = vsubq_s32(left[0], left[7]);
+ sr[7] = vsubq_s32(right[0], right[7]);
+
+ // Copy values 8-15 as we're storing in-place
+ inl[0] = left[8];
+ inr[0] = right[8];
+ inl[1] = left[9];
+ inr[1] = right[9];
+ inl[2] = left[10];
+ inr[2] = right[10];
+ inl[3] = left[11];
+ inr[3] = right[11];
+ inl[4] = left[12];
+ inr[4] = right[12];
+ inl[5] = left[13];
+ inr[5] = right[13];
+ inl[6] = left[14];
+ inr[6] = right[14];
+ inl[7] = left[15];
+ inr[7] = right[15];
+
+ // fdct4(step, step);
+ xl[0] = vaddq_s32(sl[0], sl[3]);
+ xr[0] = vaddq_s32(sr[0], sr[3]);
+ xl[1] = vaddq_s32(sl[1], sl[2]);
+ xr[1] = vaddq_s32(sr[1], sr[2]);
+ xl[2] = vsubq_s32(sl[1], sl[2]);
+ xr[2] = vsubq_s32(sr[1], sr[2]);
+ xl[3] = vsubq_s32(sl[0], sl[3]);
+ xr[3] = vsubq_s32(sr[0], sr[3]);
+
+ // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+ &left[0], &right[0], &left[8], &right[8]);
+
+ // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
+ // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
+ butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64,
+ cospi_24_64, &left[4], &right[4],
+ &left[12], &right[12]);
+
+ // Stage 2
+ // Re-using source s5/s6
+ // s5 = fdct_round_shift((s6 - s5) * cospi_16_64)
+ // s6 = fdct_round_shift((s6 + s5) * cospi_16_64)
+ butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &sl[6],
+ &sr[6], &sl[5], &sr[5]);
+
+ // Stage 3
+ xl[0] = vaddq_s32(sl[4], sl[5]);
+ xr[0] = vaddq_s32(sr[4], sr[5]);
+ xl[1] = vsubq_s32(sl[4], sl[5]);
+ xr[1] = vsubq_s32(sr[4], sr[5]);
+ xl[2] = vsubq_s32(sl[7], sl[6]);
+ xr[2] = vsubq_s32(sr[7], sr[6]);
+ xl[3] = vaddq_s32(sl[7], sl[6]);
+ xr[3] = vaddq_s32(sr[7], sr[6]);
+
+ // Stage 4
+ // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64)
+ // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64,
+ cospi_28_64, &left[2], &right[2],
+ &left[14], &right[14]);
+ // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64)
+ // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64,
+ cospi_12_64, &left[10], &right[10],
+ &left[6], &right[6]);
+
+ // step 2
+ // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results"
+ // That file distinguished between "in_high" and "step1" but the only
+ // difference is that "in_high" is the first 8 values and "step 1" is the
+ // second. Here, since they are all in one array, "step1" values are += 8.
+
+ // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64)
+ // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64)
+ // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64)
+ // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64)
+ butterfly_one_coeff_s32_fast(inl[5], inr[5], inl[2], inr[2], cospi_16_64,
+ &sl[5], &sr[5], &sl[2], &sr[2]);
+ butterfly_one_coeff_s32_fast(inl[4], inr[4], inl[3], inr[3], cospi_16_64,
+ &sl[4], &sr[4], &sl[3], &sr[3]);
+
+ // step 3
+ sl[0] = vaddq_s32(inl[0], sl[3]);
+ sr[0] = vaddq_s32(inr[0], sr[3]);
+ sl[1] = vaddq_s32(inl[1], sl[2]);
+ sr[1] = vaddq_s32(inr[1], sr[2]);
+ xl[0] = vsubq_s32(inl[1], sl[2]);
+ xr[0] = vsubq_s32(inr[1], sr[2]);
+ xl[1] = vsubq_s32(inl[0], sl[3]);
+ xr[1] = vsubq_s32(inr[0], sr[3]);
+ xl[2] = vsubq_s32(inl[7], sl[4]);
+ xr[2] = vsubq_s32(inr[7], sr[4]);
+ xl[3] = vsubq_s32(inl[6], sl[5]);
+ xr[3] = vsubq_s32(inr[6], sr[5]);
+ sl[6] = vaddq_s32(inl[6], sl[5]);
+ sr[6] = vaddq_s32(inr[6], sr[5]);
+ sl[7] = vaddq_s32(inl[7], sl[4]);
+ sr[7] = vaddq_s32(inr[7], sr[4]);
+
+ // step 4
+ // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] *
+ // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1]
+ // * cospi_8_64)
+ butterfly_two_coeff_s32_s64_narrow(sl[6], sr[6], sl[1], sr[1], cospi_8_64,
+ cospi_24_64, &sl[6], &sr[6], &sl[1],
+ &sr[1]);
+ // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64)
+ // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] *
+ // cospi_24_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[0], xr[0], xl[3], xr[3], cospi_24_64,
+ cospi_8_64, &sl[2], &sr[2], &sl[5],
+ &sr[5]);
+
+ // step 5
+ stepl[0] = vaddq_s32(sl[0], sl[1]);
+ stepr[0] = vaddq_s32(sr[0], sr[1]);
+ stepl[1] = vsubq_s32(sl[0], sl[1]);
+ stepr[1] = vsubq_s32(sr[0], sr[1]);
+ stepl[2] = vaddq_s32(xl[1], sl[2]);
+ stepr[2] = vaddq_s32(xr[1], sr[2]);
+ stepl[3] = vsubq_s32(xl[1], sl[2]);
+ stepr[3] = vsubq_s32(xr[1], sr[2]);
+ stepl[4] = vsubq_s32(xl[2], sl[5]);
+ stepr[4] = vsubq_s32(xr[2], sr[5]);
+ stepl[5] = vaddq_s32(xl[2], sl[5]);
+ stepr[5] = vaddq_s32(xr[2], sr[5]);
+ stepl[6] = vsubq_s32(sl[7], sl[6]);
+ stepr[6] = vsubq_s32(sr[7], sr[6]);
+ stepl[7] = vaddq_s32(sl[7], sl[6]);
+ stepr[7] = vaddq_s32(sr[7], sr[6]);
+
+ // step 6
+ // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64)
+ // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64)
+ butterfly_two_coeff_s32_s64_narrow(stepl[6], stepr[6], stepl[1], stepr[1],
+ cospi_18_64, cospi_14_64, &left[9],
+ &right[9], &left[7], &right[7]);
+ // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64)
+ // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64)
+ butterfly_two_coeff_s32_s64_narrow(stepl[7], stepr[7], stepl[0], stepr[0],
+ cospi_2_64, cospi_30_64, &left[1],
+ &right[1], &left[15], &right[15]);
+ // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64)
+ // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64)
+ butterfly_two_coeff_s32_s64_narrow(stepl[4], stepr[4], stepl[3], stepr[3],
+ cospi_26_64, cospi_6_64, &left[13],
+ &right[13], &left[3], &right[3]);
+ // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64)
+ // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64)
+ butterfly_two_coeff_s32_s64_narrow(stepl[5], stepr[5], stepl[2], stepr[2],
+ cospi_10_64, cospi_22_64, &left[5],
+ &right[5], &left[11], &right[11]);
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
#endif // VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_
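
Note: three slightly different round-and-shift-by-2 steps appear across these transforms. partial_round_shift() above computes (x + 1) >> 2, while the 32x32 code below documents sub_round_shift() as (x + 1 + (x > 0)) >> 2 and add_round_shift_s32() as adding 1 when positive and 2 when negative before the shift. A scalar sketch of the three, just to make the differences explicit; the _ref names are illustrative, and the NEON versions reach the same results via sign-bit arithmetic rather than comparisons:

#include <stdint.h>

/* partial_round_shift(): quarter round between the two passes. */
int32_t partial_round_shift_ref(int32_t x) { return (x + 1) >> 2; }

/* sub_round_shift(): output = (x + 1 + (x > 0)) >> 2.  The NEON version
 * subtracts the sign bit and then uses a rounding shift, which gives
 * the same result. */
int32_t sub_round_shift_ref(int32_t x) { return (x + 1 + (x > 0)) >> 2; }

/* add_round_shift_s32(): add 1 if positive, 2 if negative, shift by 2.
 * The NEON version adds the sign bit plus one, then shifts without
 * rounding. */
int32_t add_round_shift_ref(int32_t x) { return (x + 1 + (x < 0)) >> 2; }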
diff --git a/libvpx/vpx_dsp/arm/fdct32x32_neon.c b/libvpx/vpx_dsp/arm/fdct32x32_neon.c
index de74e6630..d6818d2ec 100644
--- a/libvpx/vpx_dsp/arm/fdct32x32_neon.c
+++ b/libvpx/vpx_dsp/arm/fdct32x32_neon.c
@@ -15,6 +15,8 @@
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/fdct32x32_neon.h"
// Most gcc 4.9 distributions outside of Android do not generate correct code
// for this function.
@@ -32,1289 +34,6 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
#else
-#define LOAD_INCREMENT(src, stride, dest, index) \
- do { \
- dest[index] = vld1q_s16(src); \
- src += stride; \
- } while (0)
-
-#define ADD_S16(src, index0, index1, dest, index3) \
- do { \
- dest[index3] = vaddq_s16(src[index0], src[index1]); \
- } while (0)
-
-#define ADD_SHIFT_S16(src, index0, index1) \
- do { \
- src[index1] = vshlq_n_s16(vsubq_s16(src[index0], src[index1]), 2); \
- } while (0)
-
-// Load, cross, and multiply by 4. Load the first 8 and last 8, then the
-// middle
-// 16. Doing sets of 16 at a time. Maybe sets of 8 would be better?
-static INLINE void load(const int16_t *a, int stride, int16x8_t *b) {
- const int16_t *a_end = a + 24 * stride;
- int16x8_t c[8];
-
- LOAD_INCREMENT(a, stride, b, 0);
- LOAD_INCREMENT(a, stride, b, 1);
- LOAD_INCREMENT(a, stride, b, 2);
- LOAD_INCREMENT(a, stride, b, 3);
- LOAD_INCREMENT(a, stride, b, 4);
- LOAD_INCREMENT(a, stride, b, 5);
- LOAD_INCREMENT(a, stride, b, 6);
- LOAD_INCREMENT(a, stride, b, 7);
-
- LOAD_INCREMENT(a_end, stride, b, 24);
- LOAD_INCREMENT(a_end, stride, b, 25);
- LOAD_INCREMENT(a_end, stride, b, 26);
- LOAD_INCREMENT(a_end, stride, b, 27);
- LOAD_INCREMENT(a_end, stride, b, 28);
- LOAD_INCREMENT(a_end, stride, b, 29);
- LOAD_INCREMENT(a_end, stride, b, 30);
- LOAD_INCREMENT(a_end, stride, b, 31);
-
- ADD_S16(b, 0, 31, c, 0);
- ADD_S16(b, 1, 30, c, 1);
- ADD_S16(b, 2, 29, c, 2);
- ADD_S16(b, 3, 28, c, 3);
- ADD_S16(b, 4, 27, c, 4);
- ADD_S16(b, 5, 26, c, 5);
- ADD_S16(b, 6, 25, c, 6);
- ADD_S16(b, 7, 24, c, 7);
-
- ADD_SHIFT_S16(b, 7, 24);
- ADD_SHIFT_S16(b, 6, 25);
- ADD_SHIFT_S16(b, 5, 26);
- ADD_SHIFT_S16(b, 4, 27);
- ADD_SHIFT_S16(b, 3, 28);
- ADD_SHIFT_S16(b, 2, 29);
- ADD_SHIFT_S16(b, 1, 30);
- ADD_SHIFT_S16(b, 0, 31);
-
- b[0] = vshlq_n_s16(c[0], 2);
- b[1] = vshlq_n_s16(c[1], 2);
- b[2] = vshlq_n_s16(c[2], 2);
- b[3] = vshlq_n_s16(c[3], 2);
- b[4] = vshlq_n_s16(c[4], 2);
- b[5] = vshlq_n_s16(c[5], 2);
- b[6] = vshlq_n_s16(c[6], 2);
- b[7] = vshlq_n_s16(c[7], 2);
-
- LOAD_INCREMENT(a, stride, b, 8);
- LOAD_INCREMENT(a, stride, b, 9);
- LOAD_INCREMENT(a, stride, b, 10);
- LOAD_INCREMENT(a, stride, b, 11);
- LOAD_INCREMENT(a, stride, b, 12);
- LOAD_INCREMENT(a, stride, b, 13);
- LOAD_INCREMENT(a, stride, b, 14);
- LOAD_INCREMENT(a, stride, b, 15);
- LOAD_INCREMENT(a, stride, b, 16);
- LOAD_INCREMENT(a, stride, b, 17);
- LOAD_INCREMENT(a, stride, b, 18);
- LOAD_INCREMENT(a, stride, b, 19);
- LOAD_INCREMENT(a, stride, b, 20);
- LOAD_INCREMENT(a, stride, b, 21);
- LOAD_INCREMENT(a, stride, b, 22);
- LOAD_INCREMENT(a, stride, b, 23);
-
- ADD_S16(b, 8, 23, c, 0);
- ADD_S16(b, 9, 22, c, 1);
- ADD_S16(b, 10, 21, c, 2);
- ADD_S16(b, 11, 20, c, 3);
- ADD_S16(b, 12, 19, c, 4);
- ADD_S16(b, 13, 18, c, 5);
- ADD_S16(b, 14, 17, c, 6);
- ADD_S16(b, 15, 16, c, 7);
-
- ADD_SHIFT_S16(b, 15, 16);
- ADD_SHIFT_S16(b, 14, 17);
- ADD_SHIFT_S16(b, 13, 18);
- ADD_SHIFT_S16(b, 12, 19);
- ADD_SHIFT_S16(b, 11, 20);
- ADD_SHIFT_S16(b, 10, 21);
- ADD_SHIFT_S16(b, 9, 22);
- ADD_SHIFT_S16(b, 8, 23);
-
- b[8] = vshlq_n_s16(c[0], 2);
- b[9] = vshlq_n_s16(c[1], 2);
- b[10] = vshlq_n_s16(c[2], 2);
- b[11] = vshlq_n_s16(c[3], 2);
- b[12] = vshlq_n_s16(c[4], 2);
- b[13] = vshlq_n_s16(c[5], 2);
- b[14] = vshlq_n_s16(c[6], 2);
- b[15] = vshlq_n_s16(c[7], 2);
-}
-
-#undef LOAD_INCREMENT
-#undef ADD_S16
-#undef ADD_SHIFT_S16
-
-#define STORE_S16(src, index, dest) \
- do { \
- store_s16q_to_tran_low(dest, src[index]); \
- dest += 8; \
- } while (0)
-
-// Store 32 16x8 values, assuming stride == 32.
-// Slight twist: store horizontally in blocks of 8.
-static INLINE void store(tran_low_t *a, const int16x8_t *b) {
- STORE_S16(b, 0, a);
- STORE_S16(b, 8, a);
- STORE_S16(b, 16, a);
- STORE_S16(b, 24, a);
- STORE_S16(b, 1, a);
- STORE_S16(b, 9, a);
- STORE_S16(b, 17, a);
- STORE_S16(b, 25, a);
- STORE_S16(b, 2, a);
- STORE_S16(b, 10, a);
- STORE_S16(b, 18, a);
- STORE_S16(b, 26, a);
- STORE_S16(b, 3, a);
- STORE_S16(b, 11, a);
- STORE_S16(b, 19, a);
- STORE_S16(b, 27, a);
- STORE_S16(b, 4, a);
- STORE_S16(b, 12, a);
- STORE_S16(b, 20, a);
- STORE_S16(b, 28, a);
- STORE_S16(b, 5, a);
- STORE_S16(b, 13, a);
- STORE_S16(b, 21, a);
- STORE_S16(b, 29, a);
- STORE_S16(b, 6, a);
- STORE_S16(b, 14, a);
- STORE_S16(b, 22, a);
- STORE_S16(b, 30, a);
- STORE_S16(b, 7, a);
- STORE_S16(b, 15, a);
- STORE_S16(b, 23, a);
- STORE_S16(b, 31, a);
-}
-
-#undef STORE_S16
-
-// fdct_round_shift((a +/- b) * c)
-static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
- const tran_high_t constant,
- int16x8_t *add, int16x8_t *sub) {
- const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
- const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
- const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
- const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
- const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
- const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
- const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
- const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
- const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
- const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
- *add = vcombine_s16(rounded0, rounded1);
- *sub = vcombine_s16(rounded2, rounded3);
-}
-
-// fdct_round_shift(a * c0 +/- b * c1)
-static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
- const tran_coef_t constant0,
- const tran_coef_t constant1,
- int16x8_t *add, int16x8_t *sub) {
- const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0);
- const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0);
- const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), constant1);
- const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), constant1);
- const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), constant0);
- const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), constant0);
- const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant1);
- const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant1);
- const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
- const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
- const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
- const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
- *add = vcombine_s16(rounded0, rounded1);
- *sub = vcombine_s16(rounded2, rounded3);
-}
-
-// Add 2 if positive, 1 if negative, and shift by 2.
-// In practice, subtract the sign bit, then shift with rounding.
-static INLINE int16x8_t sub_round_shift(const int16x8_t a) {
- const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
- const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
- const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
- return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2);
-}
-
-static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
- int16x8_t a[32];
- int16x8_t b[32];
-
- // Stage 1: Done as part of the load.
-
- // Stage 2.
- // Mini cross. X the first 16 values and the middle 8 of the second half.
- a[0] = vaddq_s16(in[0], in[15]);
- a[1] = vaddq_s16(in[1], in[14]);
- a[2] = vaddq_s16(in[2], in[13]);
- a[3] = vaddq_s16(in[3], in[12]);
- a[4] = vaddq_s16(in[4], in[11]);
- a[5] = vaddq_s16(in[5], in[10]);
- a[6] = vaddq_s16(in[6], in[9]);
- a[7] = vaddq_s16(in[7], in[8]);
-
- a[8] = vsubq_s16(in[7], in[8]);
- a[9] = vsubq_s16(in[6], in[9]);
- a[10] = vsubq_s16(in[5], in[10]);
- a[11] = vsubq_s16(in[4], in[11]);
- a[12] = vsubq_s16(in[3], in[12]);
- a[13] = vsubq_s16(in[2], in[13]);
- a[14] = vsubq_s16(in[1], in[14]);
- a[15] = vsubq_s16(in[0], in[15]);
-
- a[16] = in[16];
- a[17] = in[17];
- a[18] = in[18];
- a[19] = in[19];
-
- butterfly_one_coeff(in[27], in[20], cospi_16_64, &a[27], &a[20]);
- butterfly_one_coeff(in[26], in[21], cospi_16_64, &a[26], &a[21]);
- butterfly_one_coeff(in[25], in[22], cospi_16_64, &a[25], &a[22]);
- butterfly_one_coeff(in[24], in[23], cospi_16_64, &a[24], &a[23]);
-
- a[28] = in[28];
- a[29] = in[29];
- a[30] = in[30];
- a[31] = in[31];
-
- // Stage 3.
- b[0] = vaddq_s16(a[0], a[7]);
- b[1] = vaddq_s16(a[1], a[6]);
- b[2] = vaddq_s16(a[2], a[5]);
- b[3] = vaddq_s16(a[3], a[4]);
-
- b[4] = vsubq_s16(a[3], a[4]);
- b[5] = vsubq_s16(a[2], a[5]);
- b[6] = vsubq_s16(a[1], a[6]);
- b[7] = vsubq_s16(a[0], a[7]);
-
- b[8] = a[8];
- b[9] = a[9];
-
- butterfly_one_coeff(a[13], a[10], cospi_16_64, &b[13], &b[10]);
- butterfly_one_coeff(a[12], a[11], cospi_16_64, &b[12], &b[11]);
-
- b[14] = a[14];
- b[15] = a[15];
-
- b[16] = vaddq_s16(in[16], a[23]);
- b[17] = vaddq_s16(in[17], a[22]);
- b[18] = vaddq_s16(in[18], a[21]);
- b[19] = vaddq_s16(in[19], a[20]);
-
- b[20] = vsubq_s16(in[19], a[20]);
- b[21] = vsubq_s16(in[18], a[21]);
- b[22] = vsubq_s16(in[17], a[22]);
- b[23] = vsubq_s16(in[16], a[23]);
-
- b[24] = vsubq_s16(in[31], a[24]);
- b[25] = vsubq_s16(in[30], a[25]);
- b[26] = vsubq_s16(in[29], a[26]);
- b[27] = vsubq_s16(in[28], a[27]);
-
- b[28] = vaddq_s16(in[28], a[27]);
- b[29] = vaddq_s16(in[29], a[26]);
- b[30] = vaddq_s16(in[30], a[25]);
- b[31] = vaddq_s16(in[31], a[24]);
-
- // Stage 4.
- a[0] = vaddq_s16(b[0], b[3]);
- a[1] = vaddq_s16(b[1], b[2]);
- a[2] = vsubq_s16(b[1], b[2]);
- a[3] = vsubq_s16(b[0], b[3]);
-
- a[4] = b[4];
-
- butterfly_one_coeff(b[6], b[5], cospi_16_64, &a[6], &a[5]);
-
- a[7] = b[7];
-
- a[8] = vaddq_s16(b[8], b[11]);
- a[9] = vaddq_s16(b[9], b[10]);
- a[10] = vsubq_s16(b[9], b[10]);
- a[11] = vsubq_s16(b[8], b[11]);
- a[12] = vsubq_s16(b[15], b[12]);
- a[13] = vsubq_s16(b[14], b[13]);
- a[14] = vaddq_s16(b[14], b[13]);
- a[15] = vaddq_s16(b[15], b[12]);
-
- a[16] = b[16];
- a[17] = b[17];
-
- butterfly_two_coeff(b[29], b[18], cospi_24_64, cospi_8_64, &a[29], &a[18]);
- butterfly_two_coeff(b[28], b[19], cospi_24_64, cospi_8_64, &a[28], &a[19]);
- butterfly_two_coeff(b[27], b[20], -cospi_8_64, cospi_24_64, &a[27], &a[20]);
- butterfly_two_coeff(b[26], b[21], -cospi_8_64, cospi_24_64, &a[26], &a[21]);
-
- a[22] = b[22];
- a[23] = b[23];
- a[24] = b[24];
- a[25] = b[25];
-
- a[30] = b[30];
- a[31] = b[31];
-
- // Stage 5.
- butterfly_one_coeff(a[0], a[1], cospi_16_64, &b[0], &b[1]);
- butterfly_two_coeff(a[3], a[2], cospi_24_64, cospi_8_64, &b[2], &b[3]);
-
- b[4] = vaddq_s16(a[4], a[5]);
- b[5] = vsubq_s16(a[4], a[5]);
- b[6] = vsubq_s16(a[7], a[6]);
- b[7] = vaddq_s16(a[7], a[6]);
-
- b[8] = a[8];
-
- butterfly_two_coeff(a[14], a[9], cospi_24_64, cospi_8_64, &b[14], &b[9]);
- butterfly_two_coeff(a[13], a[10], -cospi_8_64, cospi_24_64, &b[13], &b[10]);
-
- b[11] = a[11];
- b[12] = a[12];
-
- b[15] = a[15];
-
- b[16] = vaddq_s16(a[19], a[16]);
- b[17] = vaddq_s16(a[18], a[17]);
- b[18] = vsubq_s16(a[17], a[18]);
- b[19] = vsubq_s16(a[16], a[19]);
- b[20] = vsubq_s16(a[23], a[20]);
- b[21] = vsubq_s16(a[22], a[21]);
- b[22] = vaddq_s16(a[21], a[22]);
- b[23] = vaddq_s16(a[20], a[23]);
- b[24] = vaddq_s16(a[27], a[24]);
- b[25] = vaddq_s16(a[26], a[25]);
- b[26] = vsubq_s16(a[25], a[26]);
- b[27] = vsubq_s16(a[24], a[27]);
- b[28] = vsubq_s16(a[31], a[28]);
- b[29] = vsubq_s16(a[30], a[29]);
- b[30] = vaddq_s16(a[29], a[30]);
- b[31] = vaddq_s16(a[28], a[31]);
-
- // Stage 6.
- a[0] = b[0];
- a[1] = b[1];
- a[2] = b[2];
- a[3] = b[3];
-
- butterfly_two_coeff(b[7], b[4], cospi_28_64, cospi_4_64, &a[4], &a[7]);
- butterfly_two_coeff(b[6], b[5], cospi_12_64, cospi_20_64, &a[5], &a[6]);
-
- a[8] = vaddq_s16(b[8], b[9]);
- a[9] = vsubq_s16(b[8], b[9]);
- a[10] = vsubq_s16(b[11], b[10]);
- a[11] = vaddq_s16(b[11], b[10]);
- a[12] = vaddq_s16(b[12], b[13]);
- a[13] = vsubq_s16(b[12], b[13]);
- a[14] = vsubq_s16(b[15], b[14]);
- a[15] = vaddq_s16(b[15], b[14]);
-
- a[16] = b[16];
- a[19] = b[19];
- a[20] = b[20];
- a[23] = b[23];
- a[24] = b[24];
- a[27] = b[27];
- a[28] = b[28];
- a[31] = b[31];
-
- butterfly_two_coeff(b[30], b[17], cospi_28_64, cospi_4_64, &a[30], &a[17]);
- butterfly_two_coeff(b[29], b[18], -cospi_4_64, cospi_28_64, &a[29], &a[18]);
-
- butterfly_two_coeff(b[26], b[21], cospi_12_64, cospi_20_64, &a[26], &a[21]);
- butterfly_two_coeff(b[25], b[22], -cospi_20_64, cospi_12_64, &a[25], &a[22]);
-
- // Stage 7.
- b[0] = a[0];
- b[1] = a[1];
- b[2] = a[2];
- b[3] = a[3];
- b[4] = a[4];
- b[5] = a[5];
- b[6] = a[6];
- b[7] = a[7];
-
- butterfly_two_coeff(a[15], a[8], cospi_30_64, cospi_2_64, &b[8], &b[15]);
- butterfly_two_coeff(a[14], a[9], cospi_14_64, cospi_18_64, &b[9], &b[14]);
- butterfly_two_coeff(a[13], a[10], cospi_22_64, cospi_10_64, &b[10], &b[13]);
- butterfly_two_coeff(a[12], a[11], cospi_6_64, cospi_26_64, &b[11], &b[12]);
-
- b[16] = vaddq_s16(a[16], a[17]);
- b[17] = vsubq_s16(a[16], a[17]);
- b[18] = vsubq_s16(a[19], a[18]);
- b[19] = vaddq_s16(a[19], a[18]);
- b[20] = vaddq_s16(a[20], a[21]);
- b[21] = vsubq_s16(a[20], a[21]);
- b[22] = vsubq_s16(a[23], a[22]);
- b[23] = vaddq_s16(a[23], a[22]);
- b[24] = vaddq_s16(a[24], a[25]);
- b[25] = vsubq_s16(a[24], a[25]);
- b[26] = vsubq_s16(a[27], a[26]);
- b[27] = vaddq_s16(a[27], a[26]);
- b[28] = vaddq_s16(a[28], a[29]);
- b[29] = vsubq_s16(a[28], a[29]);
- b[30] = vsubq_s16(a[31], a[30]);
- b[31] = vaddq_s16(a[31], a[30]);
-
- // Final stage.
- // Also compute partial rounding shift:
- // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
- out[0] = sub_round_shift(b[0]);
- out[16] = sub_round_shift(b[1]);
- out[8] = sub_round_shift(b[2]);
- out[24] = sub_round_shift(b[3]);
- out[4] = sub_round_shift(b[4]);
- out[20] = sub_round_shift(b[5]);
- out[12] = sub_round_shift(b[6]);
- out[28] = sub_round_shift(b[7]);
- out[2] = sub_round_shift(b[8]);
- out[18] = sub_round_shift(b[9]);
- out[10] = sub_round_shift(b[10]);
- out[26] = sub_round_shift(b[11]);
- out[6] = sub_round_shift(b[12]);
- out[22] = sub_round_shift(b[13]);
- out[14] = sub_round_shift(b[14]);
- out[30] = sub_round_shift(b[15]);
-
- butterfly_two_coeff(b[31], b[16], cospi_31_64, cospi_1_64, &a[1], &a[31]);
- out[1] = sub_round_shift(a[1]);
- out[31] = sub_round_shift(a[31]);
-
- butterfly_two_coeff(b[30], b[17], cospi_15_64, cospi_17_64, &a[17], &a[15]);
- out[17] = sub_round_shift(a[17]);
- out[15] = sub_round_shift(a[15]);
-
- butterfly_two_coeff(b[29], b[18], cospi_23_64, cospi_9_64, &a[9], &a[23]);
- out[9] = sub_round_shift(a[9]);
- out[23] = sub_round_shift(a[23]);
-
- butterfly_two_coeff(b[28], b[19], cospi_7_64, cospi_25_64, &a[25], &a[7]);
- out[25] = sub_round_shift(a[25]);
- out[7] = sub_round_shift(a[7]);
-
- butterfly_two_coeff(b[27], b[20], cospi_27_64, cospi_5_64, &a[5], &a[27]);
- out[5] = sub_round_shift(a[5]);
- out[27] = sub_round_shift(a[27]);
-
- butterfly_two_coeff(b[26], b[21], cospi_11_64, cospi_21_64, &a[21], &a[11]);
- out[21] = sub_round_shift(a[21]);
- out[11] = sub_round_shift(a[11]);
-
- butterfly_two_coeff(b[25], b[22], cospi_19_64, cospi_13_64, &a[13], &a[19]);
- out[13] = sub_round_shift(a[13]);
- out[19] = sub_round_shift(a[19]);
-
- butterfly_two_coeff(b[24], b[23], cospi_3_64, cospi_29_64, &a[29], &a[3]);
- out[29] = sub_round_shift(a[29]);
- out[3] = sub_round_shift(a[3]);
-}
-
-#define PASS_THROUGH(src, dst, element) \
- do { \
- dst##_lo[element] = src##_lo[element]; \
- dst##_hi[element] = src##_hi[element]; \
- } while (0)
-
-#define ADD_S16_S32(a, left_index, right_index, b, b_index) \
- do { \
- b##_lo[b_index] = \
- vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
- b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \
- vget_high_s16(a[right_index])); \
- } while (0)
-
-#define SUB_S16_S32(a, left_index, right_index, b, b_index) \
- do { \
- b##_lo[b_index] = \
- vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
- b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \
- vget_high_s16(a[right_index])); \
- } while (0)
-
-#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \
- do { \
- c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \
- c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \
- } while (0)
-
-#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \
- do { \
- temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \
- temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \
- c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \
- c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \
- } while (0)
-
-#define ADD_S32(a, left_index, right_index, b, b_index) \
- do { \
- b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \
- b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \
- } while (0)
-
-#define SUB_S32(a, left_index, right_index, b, b_index) \
- do { \
- b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \
- b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \
- } while (0)
-
-// Like butterfly_one_coeff, but don't narrow results.
-static INLINE void butterfly_one_coeff_s16_s32(
- const int16x8_t a, const int16x8_t b, const tran_high_t constant,
- int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
- int32x4_t *sub_hi) {
- const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
- const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
- const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
- const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
- const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
- const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
- *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
- *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
- *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
- *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
-}
-
-#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \
- add_index, sub_index) \
- do { \
- butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \
- &b##_lo[add_index], &b##_hi[add_index], \
- &b##_lo[sub_index], &b##_hi[sub_index]); \
- } while (0)
-
-// Like butterfly_one_coeff, but with s32.
-static INLINE void butterfly_one_coeff_s32(
- const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
- const int32x4_t b_hi, const int32_t constant, int32x4_t *add_lo,
- int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
- const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant);
- const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant);
- const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant);
- const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant);
- const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant);
- const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant);
- *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
- *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
- *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
- *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
-}
-
-#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \
- sub_index) \
- do { \
- butterfly_one_coeff_s32(a##_lo[left_index], a##_hi[left_index], \
- a##_lo[right_index], a##_hi[right_index], \
- constant, &b##_lo[add_index], &b##_hi[add_index], \
- &b##_lo[sub_index], &b##_hi[sub_index]); \
- } while (0)
-
-// Like butterfly_two_coeff, but with s32.
-static INLINE void butterfly_two_coeff_s32(
- const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
- const int32x4_t b_hi, const int32_t constant0, const int32_t constant1,
- int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
- int32x4_t *sub_hi) {
- const int32x4_t a0 = vmulq_n_s32(a_lo, constant0);
- const int32x4_t a1 = vmulq_n_s32(a_hi, constant0);
- const int32x4_t a2 = vmulq_n_s32(a_lo, constant1);
- const int32x4_t a3 = vmulq_n_s32(a_hi, constant1);
- const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0);
- const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0);
- const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1);
- const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1);
- *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
- *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
- *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
- *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
-}
-
-#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \
- right_constant, b, add_index, sub_index) \
- do { \
- butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \
- a##_lo[right_index], a##_hi[right_index], \
- left_constant, right_constant, &b##_lo[add_index], \
- &b##_hi[add_index], &b##_lo[sub_index], \
- &b##_hi[sub_index]); \
- } while (0)
-
-// Add 1 if positive, 2 if negative, and shift by 2.
-// In practice, add 1, then add the sign bit, then shift without rounding.
-static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo,
- const int32x4_t a_hi) {
- const int32x4_t one = vdupq_n_s32(1);
- const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo);
- const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31);
- const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32);
- const int16x4_t b_lo =
- vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2);
- const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi);
- const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31);
- const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32);
- const int16x4_t b_hi =
- vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2);
- return vcombine_s16(b_lo, b_hi);
-}
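A minimal scalar sketch of the rounding described in the comment above (add 1, add the sign bit, then shift right by 2, i.e. (a + 1 + (a < 0)) >> 2); the function name is illustrative only:

#include <stdint.h>

/* Scalar sketch of add_round_shift_s32: add 1, add the sign bit, then
 * arithmetic shift right by 2, matching (a + 1 + (a < 0)) >> 2.
 * E.g. 5 -> 1 and -5 -> -1. */
static int32_t add_round_shift_scalar(int32_t a) {
  return (a + 1 + (a < 0)) >> 2;
}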
-
-static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
- int16x8_t a[32];
- int16x8_t b[32];
- int32x4_t c_lo[32];
- int32x4_t c_hi[32];
- int32x4_t d_lo[32];
- int32x4_t d_hi[32];
-
- // Stage 1. Done as part of the load for the first pass.
- a[0] = vaddq_s16(in[0], in[31]);
- a[1] = vaddq_s16(in[1], in[30]);
- a[2] = vaddq_s16(in[2], in[29]);
- a[3] = vaddq_s16(in[3], in[28]);
- a[4] = vaddq_s16(in[4], in[27]);
- a[5] = vaddq_s16(in[5], in[26]);
- a[6] = vaddq_s16(in[6], in[25]);
- a[7] = vaddq_s16(in[7], in[24]);
- a[8] = vaddq_s16(in[8], in[23]);
- a[9] = vaddq_s16(in[9], in[22]);
- a[10] = vaddq_s16(in[10], in[21]);
- a[11] = vaddq_s16(in[11], in[20]);
- a[12] = vaddq_s16(in[12], in[19]);
- a[13] = vaddq_s16(in[13], in[18]);
- a[14] = vaddq_s16(in[14], in[17]);
- a[15] = vaddq_s16(in[15], in[16]);
- a[16] = vsubq_s16(in[15], in[16]);
- a[17] = vsubq_s16(in[14], in[17]);
- a[18] = vsubq_s16(in[13], in[18]);
- a[19] = vsubq_s16(in[12], in[19]);
- a[20] = vsubq_s16(in[11], in[20]);
- a[21] = vsubq_s16(in[10], in[21]);
- a[22] = vsubq_s16(in[9], in[22]);
- a[23] = vsubq_s16(in[8], in[23]);
- a[24] = vsubq_s16(in[7], in[24]);
- a[25] = vsubq_s16(in[6], in[25]);
- a[26] = vsubq_s16(in[5], in[26]);
- a[27] = vsubq_s16(in[4], in[27]);
- a[28] = vsubq_s16(in[3], in[28]);
- a[29] = vsubq_s16(in[2], in[29]);
- a[30] = vsubq_s16(in[1], in[30]);
- a[31] = vsubq_s16(in[0], in[31]);
-
- // Stage 2.
- b[0] = vaddq_s16(a[0], a[15]);
- b[1] = vaddq_s16(a[1], a[14]);
- b[2] = vaddq_s16(a[2], a[13]);
- b[3] = vaddq_s16(a[3], a[12]);
- b[4] = vaddq_s16(a[4], a[11]);
- b[5] = vaddq_s16(a[5], a[10]);
- b[6] = vaddq_s16(a[6], a[9]);
- b[7] = vaddq_s16(a[7], a[8]);
-
- b[8] = vsubq_s16(a[7], a[8]);
- b[9] = vsubq_s16(a[6], a[9]);
- b[10] = vsubq_s16(a[5], a[10]);
- b[11] = vsubq_s16(a[4], a[11]);
- b[12] = vsubq_s16(a[3], a[12]);
- b[13] = vsubq_s16(a[2], a[13]);
- b[14] = vsubq_s16(a[1], a[14]);
- b[15] = vsubq_s16(a[0], a[15]);
-
- b[16] = a[16];
- b[17] = a[17];
- b[18] = a[18];
- b[19] = a[19];
-
- butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]);
- butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]);
- butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]);
- butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]);
-
- b[28] = a[28];
- b[29] = a[29];
- b[30] = a[30];
- b[31] = a[31];
-
- // Stage 3. With extreme values for input this calculation rolls over int16_t.
- // The sources for b[0] get added multiple times and, through testing, have
- // been shown to overflow starting here.
- ADD_S16_S32(b, 0, 7, c, 0);
- ADD_S16_S32(b, 1, 6, c, 1);
- ADD_S16_S32(b, 2, 5, c, 2);
- ADD_S16_S32(b, 3, 4, c, 3);
- SUB_S16_S32(b, 3, 4, c, 4);
- SUB_S16_S32(b, 2, 5, c, 5);
- SUB_S16_S32(b, 1, 6, c, 6);
- SUB_S16_S32(b, 0, 7, c, 7);
-
- a[8] = b[8];
- a[9] = b[9];
-
- BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10);
- BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11);
-
- a[14] = b[14];
- a[15] = b[15];
-
- ADD_S16_S32(b, 16, 23, c, 16);
- ADD_S16_S32(b, 17, 22, c, 17);
- ADD_S16_S32(b, 18, 21, c, 18);
- ADD_S16_S32(b, 19, 20, c, 19);
- SUB_S16_S32(b, 19, 20, c, 20);
- SUB_S16_S32(b, 18, 21, c, 21);
- SUB_S16_S32(b, 17, 22, c, 22);
- SUB_S16_S32(b, 16, 23, c, 23);
- SUB_S16_S32(b, 31, 24, c, 24);
- SUB_S16_S32(b, 30, 25, c, 25);
- SUB_S16_S32(b, 29, 26, c, 26);
- SUB_S16_S32(b, 28, 27, c, 27);
- ADD_S16_S32(b, 28, 27, c, 28);
- ADD_S16_S32(b, 29, 26, c, 29);
- ADD_S16_S32(b, 30, 25, c, 30);
- ADD_S16_S32(b, 31, 24, c, 31);
-
- // Stage 4.
- ADD_S32(c, 0, 3, d, 0);
- ADD_S32(c, 1, 2, d, 1);
- SUB_S32(c, 1, 2, d, 2);
- SUB_S32(c, 0, 3, d, 3);
-
- PASS_THROUGH(c, d, 4);
-
- BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5);
-
- PASS_THROUGH(c, d, 7);
-
- ADDW_S16_S32(c, 11, a, 8, d, 8);
- ADDW_S16_S32(c, 10, a, 9, d, 9);
- SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10);
- SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11);
- SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12);
- SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13);
- ADDW_S16_S32(c, 13, b, 14, d, 14);
- ADDW_S16_S32(c, 12, b, 15, d, 15);
-
- PASS_THROUGH(c, d, 16);
- PASS_THROUGH(c, d, 17);
-
- BUTTERFLY_TWO_S32(c, 29, 18, cospi_24_64, cospi_8_64, d, 29, 18);
- BUTTERFLY_TWO_S32(c, 28, 19, cospi_24_64, cospi_8_64, d, 28, 19);
- BUTTERFLY_TWO_S32(c, 27, 20, -cospi_8_64, cospi_24_64, d, 27, 20);
- BUTTERFLY_TWO_S32(c, 26, 21, -cospi_8_64, cospi_24_64, d, 26, 21);
-
- PASS_THROUGH(c, d, 22);
- PASS_THROUGH(c, d, 23);
- PASS_THROUGH(c, d, 24);
- PASS_THROUGH(c, d, 25);
-
- PASS_THROUGH(c, d, 30);
- PASS_THROUGH(c, d, 31);
-
- // Stage 5.
- BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1);
- BUTTERFLY_TWO_S32(d, 3, 2, cospi_24_64, cospi_8_64, c, 2, 3);
-
- ADD_S32(d, 4, 5, c, 4);
- SUB_S32(d, 4, 5, c, 5);
- SUB_S32(d, 7, 6, c, 6);
- ADD_S32(d, 7, 6, c, 7);
-
- PASS_THROUGH(d, c, 8);
-
- BUTTERFLY_TWO_S32(d, 14, 9, cospi_24_64, cospi_8_64, c, 14, 9);
- BUTTERFLY_TWO_S32(d, 13, 10, -cospi_8_64, cospi_24_64, c, 13, 10);
-
- PASS_THROUGH(d, c, 11);
- PASS_THROUGH(d, c, 12);
- PASS_THROUGH(d, c, 15);
-
- ADD_S32(d, 16, 19, c, 16);
- ADD_S32(d, 17, 18, c, 17);
- SUB_S32(d, 17, 18, c, 18);
- SUB_S32(d, 16, 19, c, 19);
- SUB_S32(d, 23, 20, c, 20);
- SUB_S32(d, 22, 21, c, 21);
- ADD_S32(d, 22, 21, c, 22);
- ADD_S32(d, 23, 20, c, 23);
- ADD_S32(d, 24, 27, c, 24);
- ADD_S32(d, 25, 26, c, 25);
- SUB_S32(d, 25, 26, c, 26);
- SUB_S32(d, 24, 27, c, 27);
- SUB_S32(d, 31, 28, c, 28);
- SUB_S32(d, 30, 29, c, 29);
- ADD_S32(d, 30, 29, c, 30);
- ADD_S32(d, 31, 28, c, 31);
-
- // Stage 6.
- PASS_THROUGH(c, d, 0);
- PASS_THROUGH(c, d, 1);
- PASS_THROUGH(c, d, 2);
- PASS_THROUGH(c, d, 3);
-
- BUTTERFLY_TWO_S32(c, 7, 4, cospi_28_64, cospi_4_64, d, 4, 7);
- BUTTERFLY_TWO_S32(c, 6, 5, cospi_12_64, cospi_20_64, d, 5, 6);
-
- ADD_S32(c, 8, 9, d, 8);
- SUB_S32(c, 8, 9, d, 9);
- SUB_S32(c, 11, 10, d, 10);
- ADD_S32(c, 11, 10, d, 11);
- ADD_S32(c, 12, 13, d, 12);
- SUB_S32(c, 12, 13, d, 13);
- SUB_S32(c, 15, 14, d, 14);
- ADD_S32(c, 15, 14, d, 15);
-
- PASS_THROUGH(c, d, 16);
- PASS_THROUGH(c, d, 19);
- PASS_THROUGH(c, d, 20);
- PASS_THROUGH(c, d, 23);
- PASS_THROUGH(c, d, 24);
- PASS_THROUGH(c, d, 27);
- PASS_THROUGH(c, d, 28);
- PASS_THROUGH(c, d, 31);
-
- BUTTERFLY_TWO_S32(c, 30, 17, cospi_28_64, cospi_4_64, d, 30, 17);
- BUTTERFLY_TWO_S32(c, 29, 18, -cospi_4_64, cospi_28_64, d, 29, 18);
- BUTTERFLY_TWO_S32(c, 26, 21, cospi_12_64, cospi_20_64, d, 26, 21);
- BUTTERFLY_TWO_S32(c, 25, 22, -cospi_20_64, cospi_12_64, d, 25, 22);
-
- // Stage 7.
- PASS_THROUGH(d, c, 0);
- PASS_THROUGH(d, c, 1);
- PASS_THROUGH(d, c, 2);
- PASS_THROUGH(d, c, 3);
- PASS_THROUGH(d, c, 4);
- PASS_THROUGH(d, c, 5);
- PASS_THROUGH(d, c, 6);
- PASS_THROUGH(d, c, 7);
-
- BUTTERFLY_TWO_S32(d, 15, 8, cospi_30_64, cospi_2_64, c, 8, 15);
- BUTTERFLY_TWO_S32(d, 14, 9, cospi_14_64, cospi_18_64, c, 9, 14);
- BUTTERFLY_TWO_S32(d, 13, 10, cospi_22_64, cospi_10_64, c, 10, 13);
- BUTTERFLY_TWO_S32(d, 12, 11, cospi_6_64, cospi_26_64, c, 11, 12);
-
- ADD_S32(d, 16, 17, c, 16);
- SUB_S32(d, 16, 17, c, 17);
- SUB_S32(d, 19, 18, c, 18);
- ADD_S32(d, 19, 18, c, 19);
- ADD_S32(d, 20, 21, c, 20);
- SUB_S32(d, 20, 21, c, 21);
- SUB_S32(d, 23, 22, c, 22);
- ADD_S32(d, 23, 22, c, 23);
- ADD_S32(d, 24, 25, c, 24);
- SUB_S32(d, 24, 25, c, 25);
- SUB_S32(d, 27, 26, c, 26);
- ADD_S32(d, 27, 26, c, 27);
- ADD_S32(d, 28, 29, c, 28);
- SUB_S32(d, 28, 29, c, 29);
- SUB_S32(d, 31, 30, c, 30);
- ADD_S32(d, 31, 30, c, 31);
-
- // Final stage.
- // Roll rounding into this function so we can pass back int16x8.
-
- out[0] = add_round_shift_s32(c_lo[0], c_hi[0]);
- out[16] = add_round_shift_s32(c_lo[1], c_hi[1]);
-
- out[8] = add_round_shift_s32(c_lo[2], c_hi[2]);
- out[24] = add_round_shift_s32(c_lo[3], c_hi[3]);
- out[4] = add_round_shift_s32(c_lo[4], c_hi[4]);
- out[20] = add_round_shift_s32(c_lo[5], c_hi[5]);
- out[12] = add_round_shift_s32(c_lo[6], c_hi[6]);
-
- out[28] = add_round_shift_s32(c_lo[7], c_hi[7]);
- out[2] = add_round_shift_s32(c_lo[8], c_hi[8]);
- out[18] = add_round_shift_s32(c_lo[9], c_hi[9]);
- out[10] = add_round_shift_s32(c_lo[10], c_hi[10]);
-
- out[26] = add_round_shift_s32(c_lo[11], c_hi[11]);
- out[6] = add_round_shift_s32(c_lo[12], c_hi[12]);
- out[22] = add_round_shift_s32(c_lo[13], c_hi[13]);
- out[14] = add_round_shift_s32(c_lo[14], c_hi[14]);
- out[30] = add_round_shift_s32(c_lo[15], c_hi[15]);
-
- BUTTERFLY_TWO_S32(c, 31, 16, cospi_31_64, cospi_1_64, d, 1, 31);
- out[1] = add_round_shift_s32(d_lo[1], d_hi[1]);
- out[31] = add_round_shift_s32(d_lo[31], d_hi[31]);
-
- BUTTERFLY_TWO_S32(c, 30, 17, cospi_15_64, cospi_17_64, d, 17, 15);
- out[17] = add_round_shift_s32(d_lo[17], d_hi[17]);
- out[15] = add_round_shift_s32(d_lo[15], d_hi[15]);
-
- BUTTERFLY_TWO_S32(c, 29, 18, cospi_23_64, cospi_9_64, d, 9, 23);
- out[9] = add_round_shift_s32(d_lo[9], d_hi[9]);
- out[23] = add_round_shift_s32(d_lo[23], d_hi[23]);
-
- BUTTERFLY_TWO_S32(c, 28, 19, cospi_7_64, cospi_25_64, d, 25, 7);
- out[25] = add_round_shift_s32(d_lo[25], d_hi[25]);
- out[7] = add_round_shift_s32(d_lo[7], d_hi[7]);
-
- BUTTERFLY_TWO_S32(c, 27, 20, cospi_27_64, cospi_5_64, d, 5, 27);
- out[5] = add_round_shift_s32(d_lo[5], d_hi[5]);
- out[27] = add_round_shift_s32(d_lo[27], d_hi[27]);
-
- BUTTERFLY_TWO_S32(c, 26, 21, cospi_11_64, cospi_21_64, d, 21, 11);
- out[21] = add_round_shift_s32(d_lo[21], d_hi[21]);
- out[11] = add_round_shift_s32(d_lo[11], d_hi[11]);
-
- BUTTERFLY_TWO_S32(c, 25, 22, cospi_19_64, cospi_13_64, d, 13, 19);
- out[13] = add_round_shift_s32(d_lo[13], d_hi[13]);
- out[19] = add_round_shift_s32(d_lo[19], d_hi[19]);
-
- BUTTERFLY_TWO_S32(c, 24, 23, cospi_3_64, cospi_29_64, d, 29, 3);
- out[29] = add_round_shift_s32(d_lo[29], d_hi[29]);
- out[3] = add_round_shift_s32(d_lo[3], d_hi[3]);
-}
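The stage 3 comment above is the reason this second pass widens every row into _lo/_hi int32x4 pairs: repeated additions push the magnitudes past INT16_MAX. A tiny, self-contained illustration of the wrap-around that 32-bit accumulation avoids (the values are arbitrary):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int16_t x = 30000, y = 10000;
  const int16_t wrapped = (int16_t)(x + y); /* wraps to -25536 */
  const int32_t exact = (int32_t)x + y;     /* 40000, exact in 32 bits */
  printf("wrapped=%d exact=%d\n", wrapped, exact);
  return 0;
}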
-
-// Add 1 if positive, 2 if negative, and shift by 2.
-// In practice, add 1, then add the sign bit, then shift without rounding.
-static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) {
- const int16x8_t one = vdupq_n_s16(1);
- const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
- const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
- const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
- return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2);
-}
-
-static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) {
- int16x8_t a[32];
- int16x8_t b[32];
-
- // Stage 1. Done as part of the load for the first pass.
- a[0] = vaddq_s16(in[0], in[31]);
- a[1] = vaddq_s16(in[1], in[30]);
- a[2] = vaddq_s16(in[2], in[29]);
- a[3] = vaddq_s16(in[3], in[28]);
- a[4] = vaddq_s16(in[4], in[27]);
- a[5] = vaddq_s16(in[5], in[26]);
- a[6] = vaddq_s16(in[6], in[25]);
- a[7] = vaddq_s16(in[7], in[24]);
- a[8] = vaddq_s16(in[8], in[23]);
- a[9] = vaddq_s16(in[9], in[22]);
- a[10] = vaddq_s16(in[10], in[21]);
- a[11] = vaddq_s16(in[11], in[20]);
- a[12] = vaddq_s16(in[12], in[19]);
- a[13] = vaddq_s16(in[13], in[18]);
- a[14] = vaddq_s16(in[14], in[17]);
- a[15] = vaddq_s16(in[15], in[16]);
- a[16] = vsubq_s16(in[15], in[16]);
- a[17] = vsubq_s16(in[14], in[17]);
- a[18] = vsubq_s16(in[13], in[18]);
- a[19] = vsubq_s16(in[12], in[19]);
- a[20] = vsubq_s16(in[11], in[20]);
- a[21] = vsubq_s16(in[10], in[21]);
- a[22] = vsubq_s16(in[9], in[22]);
- a[23] = vsubq_s16(in[8], in[23]);
- a[24] = vsubq_s16(in[7], in[24]);
- a[25] = vsubq_s16(in[6], in[25]);
- a[26] = vsubq_s16(in[5], in[26]);
- a[27] = vsubq_s16(in[4], in[27]);
- a[28] = vsubq_s16(in[3], in[28]);
- a[29] = vsubq_s16(in[2], in[29]);
- a[30] = vsubq_s16(in[1], in[30]);
- a[31] = vsubq_s16(in[0], in[31]);
-
- // Stage 2.
- // For the "rd" version, all the values are rounded down after stage 2 to keep
- // the values in 16 bits.
- b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15]));
- b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14]));
- b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13]));
- b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12]));
- b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11]));
- b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10]));
- b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9]));
- b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8]));
-
- b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8]));
- b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9]));
- b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10]));
- b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11]));
- b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12]));
- b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13]));
- b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14]));
- b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15]));
-
- b[16] = add_round_shift_s16(a[16]);
- b[17] = add_round_shift_s16(a[17]);
- b[18] = add_round_shift_s16(a[18]);
- b[19] = add_round_shift_s16(a[19]);
-
- butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]);
- butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]);
- butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]);
- butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]);
- b[20] = add_round_shift_s16(b[20]);
- b[21] = add_round_shift_s16(b[21]);
- b[22] = add_round_shift_s16(b[22]);
- b[23] = add_round_shift_s16(b[23]);
- b[24] = add_round_shift_s16(b[24]);
- b[25] = add_round_shift_s16(b[25]);
- b[26] = add_round_shift_s16(b[26]);
- b[27] = add_round_shift_s16(b[27]);
-
- b[28] = add_round_shift_s16(a[28]);
- b[29] = add_round_shift_s16(a[29]);
- b[30] = add_round_shift_s16(a[30]);
- b[31] = add_round_shift_s16(a[31]);
-
- // Stage 3.
- a[0] = vaddq_s16(b[0], b[7]);
- a[1] = vaddq_s16(b[1], b[6]);
- a[2] = vaddq_s16(b[2], b[5]);
- a[3] = vaddq_s16(b[3], b[4]);
-
- a[4] = vsubq_s16(b[3], b[4]);
- a[5] = vsubq_s16(b[2], b[5]);
- a[6] = vsubq_s16(b[1], b[6]);
- a[7] = vsubq_s16(b[0], b[7]);
-
- a[8] = b[8];
- a[9] = b[9];
-
- butterfly_one_coeff(b[13], b[10], cospi_16_64, &a[13], &a[10]);
- butterfly_one_coeff(b[12], b[11], cospi_16_64, &a[12], &a[11]);
-
- a[14] = b[14];
- a[15] = b[15];
-
- a[16] = vaddq_s16(b[16], b[23]);
- a[17] = vaddq_s16(b[17], b[22]);
- a[18] = vaddq_s16(b[18], b[21]);
- a[19] = vaddq_s16(b[19], b[20]);
-
- a[20] = vsubq_s16(b[19], b[20]);
- a[21] = vsubq_s16(b[18], b[21]);
- a[22] = vsubq_s16(b[17], b[22]);
- a[23] = vsubq_s16(b[16], b[23]);
-
- a[24] = vsubq_s16(b[31], b[24]);
- a[25] = vsubq_s16(b[30], b[25]);
- a[26] = vsubq_s16(b[29], b[26]);
- a[27] = vsubq_s16(b[28], b[27]);
-
- a[28] = vaddq_s16(b[28], b[27]);
- a[29] = vaddq_s16(b[29], b[26]);
- a[30] = vaddq_s16(b[30], b[25]);
- a[31] = vaddq_s16(b[31], b[24]);
-
- // Stage 4.
- b[0] = vaddq_s16(a[0], a[3]);
- b[1] = vaddq_s16(a[1], a[2]);
- b[2] = vsubq_s16(a[1], a[2]);
- b[3] = vsubq_s16(a[0], a[3]);
-
- b[4] = a[4];
-
- butterfly_one_coeff(a[6], a[5], cospi_16_64, &b[6], &b[5]);
-
- b[7] = a[7];
-
- b[8] = vaddq_s16(a[8], a[11]);
- b[9] = vaddq_s16(a[9], a[10]);
- b[10] = vsubq_s16(a[9], a[10]);
- b[11] = vsubq_s16(a[8], a[11]);
- b[12] = vsubq_s16(a[15], a[12]);
- b[13] = vsubq_s16(a[14], a[13]);
- b[14] = vaddq_s16(a[14], a[13]);
- b[15] = vaddq_s16(a[15], a[12]);
-
- b[16] = a[16];
- b[17] = a[17];
-
- butterfly_two_coeff(a[29], a[18], cospi_24_64, cospi_8_64, &b[29], &b[18]);
- butterfly_two_coeff(a[28], a[19], cospi_24_64, cospi_8_64, &b[28], &b[19]);
- butterfly_two_coeff(a[27], a[20], -cospi_8_64, cospi_24_64, &b[27], &b[20]);
- butterfly_two_coeff(a[26], a[21], -cospi_8_64, cospi_24_64, &b[26], &b[21]);
-
- b[22] = a[22];
- b[23] = a[23];
- b[24] = a[24];
- b[25] = a[25];
-
- b[30] = a[30];
- b[31] = a[31];
-
- // Stage 5.
- butterfly_one_coeff(b[0], b[1], cospi_16_64, &a[0], &a[1]);
- butterfly_two_coeff(b[3], b[2], cospi_24_64, cospi_8_64, &a[2], &a[3]);
-
- a[4] = vaddq_s16(b[4], b[5]);
- a[5] = vsubq_s16(b[4], b[5]);
- a[6] = vsubq_s16(b[7], b[6]);
- a[7] = vaddq_s16(b[7], b[6]);
-
- a[8] = b[8];
-
- butterfly_two_coeff(b[14], b[9], cospi_24_64, cospi_8_64, &a[14], &a[9]);
- butterfly_two_coeff(b[13], b[10], -cospi_8_64, cospi_24_64, &a[13], &a[10]);
-
- a[11] = b[11];
- a[12] = b[12];
-
- a[15] = b[15];
-
- a[16] = vaddq_s16(b[19], b[16]);
- a[17] = vaddq_s16(b[18], b[17]);
- a[18] = vsubq_s16(b[17], b[18]);
- a[19] = vsubq_s16(b[16], b[19]);
- a[20] = vsubq_s16(b[23], b[20]);
- a[21] = vsubq_s16(b[22], b[21]);
- a[22] = vaddq_s16(b[21], b[22]);
- a[23] = vaddq_s16(b[20], b[23]);
- a[24] = vaddq_s16(b[27], b[24]);
- a[25] = vaddq_s16(b[26], b[25]);
- a[26] = vsubq_s16(b[25], b[26]);
- a[27] = vsubq_s16(b[24], b[27]);
- a[28] = vsubq_s16(b[31], b[28]);
- a[29] = vsubq_s16(b[30], b[29]);
- a[30] = vaddq_s16(b[29], b[30]);
- a[31] = vaddq_s16(b[28], b[31]);
-
- // Stage 6.
- b[0] = a[0];
- b[1] = a[1];
- b[2] = a[2];
- b[3] = a[3];
-
- butterfly_two_coeff(a[7], a[4], cospi_28_64, cospi_4_64, &b[4], &b[7]);
- butterfly_two_coeff(a[6], a[5], cospi_12_64, cospi_20_64, &b[5], &b[6]);
-
- b[8] = vaddq_s16(a[8], a[9]);
- b[9] = vsubq_s16(a[8], a[9]);
- b[10] = vsubq_s16(a[11], a[10]);
- b[11] = vaddq_s16(a[11], a[10]);
- b[12] = vaddq_s16(a[12], a[13]);
- b[13] = vsubq_s16(a[12], a[13]);
- b[14] = vsubq_s16(a[15], a[14]);
- b[15] = vaddq_s16(a[15], a[14]);
-
- b[16] = a[16];
- b[19] = a[19];
- b[20] = a[20];
- b[23] = a[23];
- b[24] = a[24];
- b[27] = a[27];
- b[28] = a[28];
- b[31] = a[31];
-
- butterfly_two_coeff(a[30], a[17], cospi_28_64, cospi_4_64, &b[30], &b[17]);
- butterfly_two_coeff(a[29], a[18], -cospi_4_64, cospi_28_64, &b[29], &b[18]);
-
- butterfly_two_coeff(a[26], a[21], cospi_12_64, cospi_20_64, &b[26], &b[21]);
- butterfly_two_coeff(a[25], a[22], -cospi_20_64, cospi_12_64, &b[25], &b[22]);
-
- // Stage 7.
- a[0] = b[0];
- a[1] = b[1];
- a[2] = b[2];
- a[3] = b[3];
- a[4] = b[4];
- a[5] = b[5];
- a[6] = b[6];
- a[7] = b[7];
-
- butterfly_two_coeff(b[15], b[8], cospi_30_64, cospi_2_64, &a[8], &a[15]);
- butterfly_two_coeff(b[14], b[9], cospi_14_64, cospi_18_64, &a[9], &a[14]);
- butterfly_two_coeff(b[13], b[10], cospi_22_64, cospi_10_64, &a[10], &a[13]);
- butterfly_two_coeff(b[12], b[11], cospi_6_64, cospi_26_64, &a[11], &a[12]);
-
- a[16] = vaddq_s16(b[16], b[17]);
- a[17] = vsubq_s16(b[16], b[17]);
- a[18] = vsubq_s16(b[19], b[18]);
- a[19] = vaddq_s16(b[19], b[18]);
- a[20] = vaddq_s16(b[20], b[21]);
- a[21] = vsubq_s16(b[20], b[21]);
- a[22] = vsubq_s16(b[23], b[22]);
- a[23] = vaddq_s16(b[23], b[22]);
- a[24] = vaddq_s16(b[24], b[25]);
- a[25] = vsubq_s16(b[24], b[25]);
- a[26] = vsubq_s16(b[27], b[26]);
- a[27] = vaddq_s16(b[27], b[26]);
- a[28] = vaddq_s16(b[28], b[29]);
- a[29] = vsubq_s16(b[28], b[29]);
- a[30] = vsubq_s16(b[31], b[30]);
- a[31] = vaddq_s16(b[31], b[30]);
-
- // Final stage.
- out[0] = a[0];
- out[16] = a[1];
- out[8] = a[2];
- out[24] = a[3];
- out[4] = a[4];
- out[20] = a[5];
- out[12] = a[6];
- out[28] = a[7];
- out[2] = a[8];
- out[18] = a[9];
- out[10] = a[10];
- out[26] = a[11];
- out[6] = a[12];
- out[22] = a[13];
- out[14] = a[14];
- out[30] = a[15];
-
- butterfly_two_coeff(a[31], a[16], cospi_31_64, cospi_1_64, &out[1], &out[31]);
- butterfly_two_coeff(a[30], a[17], cospi_15_64, cospi_17_64, &out[17],
- &out[15]);
- butterfly_two_coeff(a[29], a[18], cospi_23_64, cospi_9_64, &out[9], &out[23]);
- butterfly_two_coeff(a[28], a[19], cospi_7_64, cospi_25_64, &out[25], &out[7]);
- butterfly_two_coeff(a[27], a[20], cospi_27_64, cospi_5_64, &out[5], &out[27]);
- butterfly_two_coeff(a[26], a[21], cospi_11_64, cospi_21_64, &out[21],
- &out[11]);
- butterfly_two_coeff(a[25], a[22], cospi_19_64, cospi_13_64, &out[13],
- &out[19]);
- butterfly_two_coeff(a[24], a[23], cospi_3_64, cospi_29_64, &out[29], &out[3]);
-}
-
-#undef PASS_THROUGH
-#undef ADD_S16_S32
-#undef SUB_S16_S32
-#undef ADDW_S16_S32
-#undef SUBW_S16_S32
-#undef ADD_S32
-#undef SUB_S32
-#undef BUTTERFLY_ONE_S16_S32
-#undef BUTTERFLY_ONE_S32
-#undef BUTTERFLY_TWO_S32
-
-// Transpose 8x8 to a new location. Don't use transpose_neon.h because those
-// are all in-place.
-// TODO(johannkoenig): share with other fdcts.
-static INLINE void transpose_8x8(const int16x8_t *a, int16x8_t *b) {
- // Swap 16 bit elements.
- const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
- const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
- const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
- const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
-
- // Swap 32 bit elements.
- const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
- vreinterpretq_s32_s16(c1.val[0]));
- const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
- vreinterpretq_s32_s16(c1.val[1]));
- const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
- vreinterpretq_s32_s16(c3.val[0]));
- const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
- vreinterpretq_s32_s16(c3.val[1]));
-
- // Swap 64 bit elements
- const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
- const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
- const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
- const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
-
- b[0] = e0.val[0];
- b[1] = e1.val[0];
- b[2] = e2.val[0];
- b[3] = e3.val[0];
- b[4] = e0.val[1];
- b[5] = e1.val[1];
- b[6] = e2.val[1];
- b[7] = e3.val[1];
-}
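The out-of-place 8x8 transpose above (its callers below now switch to transpose_s16_8x8_new) is built from three levels of vtrn; the scalar index mapping it implements is simply:

#include <stdint.h>

/* Scalar equivalent of transpose_8x8, treating each int16x8_t as one row. */
static void transpose_8x8_scalar(const int16_t a[8][8], int16_t b[8][8]) {
  int row, col;
  for (row = 0; row < 8; ++row) {
    for (col = 0; col < 8; ++col) b[row][col] = a[col][row];
  }
}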
-
void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
int16x8_t temp0[32];
int16x8_t temp1[32];
@@ -1324,23 +43,27 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
int16x8_t temp5[32];
// Process in 8x32 columns.
- load(input, stride, temp0);
- dct_body_first_pass(temp0, temp1);
+ load_cross(input, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp1);
- load(input + 8, stride, temp0);
- dct_body_first_pass(temp0, temp2);
+ load_cross(input + 8, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp2);
- load(input + 16, stride, temp0);
- dct_body_first_pass(temp0, temp3);
+ load_cross(input + 16, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp3);
- load(input + 24, stride, temp0);
- dct_body_first_pass(temp0, temp4);
+ load_cross(input + 24, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp4);
// Generate the top row by munging the first set of 8 from each one together.
- transpose_8x8(&temp1[0], &temp0[0]);
- transpose_8x8(&temp2[0], &temp0[8]);
- transpose_8x8(&temp3[0], &temp0[16]);
- transpose_8x8(&temp4[0], &temp0[24]);
+ transpose_s16_8x8_new(&temp1[0], &temp0[0]);
+ transpose_s16_8x8_new(&temp2[0], &temp0[8]);
+ transpose_s16_8x8_new(&temp3[0], &temp0[16]);
+ transpose_s16_8x8_new(&temp4[0], &temp0[24]);
dct_body_second_pass(temp0, temp5);
@@ -1355,10 +78,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
store(output, temp5);
// Second row of 8x32.
- transpose_8x8(&temp1[8], &temp0[0]);
- transpose_8x8(&temp2[8], &temp0[8]);
- transpose_8x8(&temp3[8], &temp0[16]);
- transpose_8x8(&temp4[8], &temp0[24]);
+ transpose_s16_8x8_new(&temp1[8], &temp0[0]);
+ transpose_s16_8x8_new(&temp2[8], &temp0[8]);
+ transpose_s16_8x8_new(&temp3[8], &temp0[16]);
+ transpose_s16_8x8_new(&temp4[8], &temp0[24]);
dct_body_second_pass(temp0, temp5);
@@ -1373,10 +96,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
store(output + 8 * 32, temp5);
// Third row of 8x32
- transpose_8x8(&temp1[16], &temp0[0]);
- transpose_8x8(&temp2[16], &temp0[8]);
- transpose_8x8(&temp3[16], &temp0[16]);
- transpose_8x8(&temp4[16], &temp0[24]);
+ transpose_s16_8x8_new(&temp1[16], &temp0[0]);
+ transpose_s16_8x8_new(&temp2[16], &temp0[8]);
+ transpose_s16_8x8_new(&temp3[16], &temp0[16]);
+ transpose_s16_8x8_new(&temp4[16], &temp0[24]);
dct_body_second_pass(temp0, temp5);
@@ -1391,10 +114,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
store(output + 16 * 32, temp5);
// Final row of 8x32.
- transpose_8x8(&temp1[24], &temp0[0]);
- transpose_8x8(&temp2[24], &temp0[8]);
- transpose_8x8(&temp3[24], &temp0[16]);
- transpose_8x8(&temp4[24], &temp0[24]);
+ transpose_s16_8x8_new(&temp1[24], &temp0[0]);
+ transpose_s16_8x8_new(&temp2[24], &temp0[8]);
+ transpose_s16_8x8_new(&temp3[24], &temp0[16]);
+ transpose_s16_8x8_new(&temp4[24], &temp0[24]);
dct_body_second_pass(temp0, temp5);
@@ -1419,23 +142,27 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
int16x8_t temp5[32];
// Process in 8x32 columns.
- load(input, stride, temp0);
- dct_body_first_pass(temp0, temp1);
+ load_cross(input, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp1);
- load(input + 8, stride, temp0);
- dct_body_first_pass(temp0, temp2);
+ load_cross(input + 8, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp2);
- load(input + 16, stride, temp0);
- dct_body_first_pass(temp0, temp3);
+ load_cross(input + 16, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp3);
- load(input + 24, stride, temp0);
- dct_body_first_pass(temp0, temp4);
+ load_cross(input + 24, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp4);
// Generate the top row by munging the first set of 8 from each one together.
- transpose_8x8(&temp1[0], &temp0[0]);
- transpose_8x8(&temp2[0], &temp0[8]);
- transpose_8x8(&temp3[0], &temp0[16]);
- transpose_8x8(&temp4[0], &temp0[24]);
+ transpose_s16_8x8_new(&temp1[0], &temp0[0]);
+ transpose_s16_8x8_new(&temp2[0], &temp0[8]);
+ transpose_s16_8x8_new(&temp3[0], &temp0[16]);
+ transpose_s16_8x8_new(&temp4[0], &temp0[24]);
dct_body_second_pass_rd(temp0, temp5);
@@ -1450,10 +177,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
store(output, temp5);
// Second row of 8x32.
- transpose_8x8(&temp1[8], &temp0[0]);
- transpose_8x8(&temp2[8], &temp0[8]);
- transpose_8x8(&temp3[8], &temp0[16]);
- transpose_8x8(&temp4[8], &temp0[24]);
+ transpose_s16_8x8_new(&temp1[8], &temp0[0]);
+ transpose_s16_8x8_new(&temp2[8], &temp0[8]);
+ transpose_s16_8x8_new(&temp3[8], &temp0[16]);
+ transpose_s16_8x8_new(&temp4[8], &temp0[24]);
dct_body_second_pass_rd(temp0, temp5);
@@ -1468,10 +195,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
store(output + 8 * 32, temp5);
// Third row of 8x32
- transpose_8x8(&temp1[16], &temp0[0]);
- transpose_8x8(&temp2[16], &temp0[8]);
- transpose_8x8(&temp3[16], &temp0[16]);
- transpose_8x8(&temp4[16], &temp0[24]);
+ transpose_s16_8x8_new(&temp1[16], &temp0[0]);
+ transpose_s16_8x8_new(&temp2[16], &temp0[8]);
+ transpose_s16_8x8_new(&temp3[16], &temp0[16]);
+ transpose_s16_8x8_new(&temp4[16], &temp0[24]);
dct_body_second_pass_rd(temp0, temp5);
@@ -1486,10 +213,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
store(output + 16 * 32, temp5);
// Final row of 8x32.
- transpose_8x8(&temp1[24], &temp0[0]);
- transpose_8x8(&temp2[24], &temp0[8]);
- transpose_8x8(&temp3[24], &temp0[16]);
- transpose_8x8(&temp4[24], &temp0[24]);
+ transpose_s16_8x8_new(&temp1[24], &temp0[0]);
+ transpose_s16_8x8_new(&temp2[24], &temp0[8]);
+ transpose_s16_8x8_new(&temp3[24], &temp0[16]);
+ transpose_s16_8x8_new(&temp4[24], &temp0[24]);
dct_body_second_pass_rd(temp0, temp5);
@@ -1503,5 +230,190 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
&temp5[29], &temp5[30], &temp5[31]);
store(output + 24 * 32, temp5);
}

+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct32x32_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int16x8_t temp0[32];
+ int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32],
+ right3[32], right4[32];
+ int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32],
+ left8[32], right8[32];
+ int32x4_t temp1[32], temp2[32];
+
+ // Process in 8x32 columns.
+ load_cross(input, stride, temp0);
+ highbd_scale_input(temp0, left1, right1);
+ highbd_dct8x32_body_first_pass(left1, right1);
+ highbd_partial_sub_round_shift(left1, right1);
+
+ load_cross(input + 8, stride, temp0);
+ highbd_scale_input(temp0, left2, right2);
+ highbd_dct8x32_body_first_pass(left2, right2);
+ highbd_partial_sub_round_shift(left2, right2);
+
+ load_cross(input + 16, stride, temp0);
+ highbd_scale_input(temp0, left3, right3);
+ highbd_dct8x32_body_first_pass(left3, right3);
+ highbd_partial_sub_round_shift(left3, right3);
+
+ load_cross(input + 24, stride, temp0);
+ highbd_scale_input(temp0, left4, right4);
+ highbd_dct8x32_body_first_pass(left4, right4);
+ highbd_partial_sub_round_shift(left4, right4);
+
+ // Generate the top row by munging the first set of 8 from each one together.
+ transpose_s32_8x8_2(left1, right1, temp1, temp2);
+ transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left5, right5);
+ highbd_dct8x32_body_second_pass(left5, right5);
+ highbd_partial_add_round_shift(left5, right5);
+
+ // Second row of 8x32.
+ transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left6, right6);
+ highbd_dct8x32_body_second_pass(left6, right6);
+ highbd_partial_add_round_shift(left6, right6);
+
+ // Third row of 8x32
+ transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left7, right7);
+ highbd_dct8x32_body_second_pass(left7, right7);
+ highbd_partial_add_round_shift(left7, right7);
+
+ // Final row of 8x32.
+ transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left8, right8);
+ highbd_dct8x32_body_second_pass(left8, right8);
+ highbd_partial_add_round_shift(left8, right8);
+
+ // Final transpose
+ transpose_s32_8x8_2(left5, right5, left1, right1);
+ transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2);
+ transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3);
+ transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4);
+ transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8);
+ transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8);
+ transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8);
+ transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8);
+ transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16);
+ transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16);
+ transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16);
+ transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 16);
+ transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24);
+ transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24);
+ transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24);
+ transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24);
+
+ store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4,
+ right4);
+}
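The high bit depth path keeps each 8-wide row as a pair of int32x4 vectors: "left" holds lanes 0-3 and "right" holds lanes 4-7, so every helper takes two quad registers per row. The actual widening happens inside highbd_scale_input; a hypothetical sketch of the underlying idiom (the helper name is illustrative, not part of the library):

#include <arm_neon.h>

/* Hypothetical sketch: split one int16x8_t row into widened left/right
 * int32x4 halves, as used throughout the high bit depth 32x32 path. */
static inline void widen_row_s16_to_s32(const int16x8_t row, int32x4_t *left,
                                        int32x4_t *right) {
  *left = vmovl_s16(vget_low_s16(row));   /* lanes 0-3 */
  *right = vmovl_s16(vget_high_s16(row)); /* lanes 4-7 */
}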
+
+void vpx_highbd_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int16x8_t temp0[32];
+ int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32],
+ right3[32], right4[32];
+ int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32],
+ left8[32], right8[32];
+ int32x4_t temp1[32], temp2[32];
+
+ // Process in 8x32 columns.
+ load_cross(input, stride, temp0);
+ highbd_scale_input(temp0, left1, right1);
+ highbd_dct8x32_body_first_pass(left1, right1);
+ highbd_partial_sub_round_shift(left1, right1);
+
+ load_cross(input + 8, stride, temp0);
+ highbd_scale_input(temp0, left2, right2);
+ highbd_dct8x32_body_first_pass(left2, right2);
+ highbd_partial_sub_round_shift(left2, right2);
+
+ load_cross(input + 16, stride, temp0);
+ highbd_scale_input(temp0, left3, right3);
+ highbd_dct8x32_body_first_pass(left3, right3);
+ highbd_partial_sub_round_shift(left3, right3);
+
+ load_cross(input + 24, stride, temp0);
+ highbd_scale_input(temp0, left4, right4);
+ highbd_dct8x32_body_first_pass(left4, right4);
+ highbd_partial_sub_round_shift(left4, right4);
+
+ // Generate the top row by munging the first set of 8 from each one together.
+ transpose_s32_8x8_2(left1, right1, temp1, temp2);
+ transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left5, right5);
+ highbd_dct8x32_body_second_pass_rd(left5, right5);
+
+ // Second row of 8x32.
+ transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left6, right6);
+ highbd_dct8x32_body_second_pass_rd(left6, right6);
+
+ // Third row of 8x32
+ transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left7, right7);
+ highbd_dct8x32_body_second_pass_rd(left7, right7);
+
+ // Final row of 8x32.
+ transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left8, right8);
+ highbd_dct8x32_body_second_pass_rd(left8, right8);
+
+ // Final transpose
+ transpose_s32_8x8_2(left5, right5, left1, right1);
+ transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2);
+ transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3);
+ transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4);
+ transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8);
+ transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8);
+ transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8);
+ transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8);
+ transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16);
+ transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16);
+ transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16);
+ transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 16);
+ transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24);
+ transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24);
+ transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24);
+ transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24);
+
+ store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4,
+ right4);
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
#endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) &&
// __GNUC__ == 4 && __GNUC_MINOR__ <= 9
diff --git a/libvpx/vpx_dsp/arm/fdct32x32_neon.h b/libvpx/vpx_dsp/arm/fdct32x32_neon.h
new file mode 100644
index 000000000..3b9e64c6d
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/fdct32x32_neon.h
@@ -0,0 +1,2919 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+
+// Load and cross (add/subtract) the first 8 and last 8 rows, then the
+// middle 16.
+static INLINE void load_cross(const int16_t *a, int stride, int16x8_t *b) {
+ b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride));
+ b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride));
+ b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride));
+ b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride));
+ b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride));
+ b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride));
+ b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride));
+ b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride));
+
+ b[24] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride));
+ b[25] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride));
+ b[26] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride));
+ b[27] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride));
+ b[28] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride));
+ b[29] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride));
+ b[30] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride));
+ b[31] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride));
+
+ b[8] = vaddq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride));
+ b[9] = vaddq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride));
+ b[10] = vaddq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride));
+ b[11] = vaddq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride));
+ b[12] = vaddq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride));
+ b[13] = vaddq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride));
+ b[14] = vaddq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride));
+ b[15] = vaddq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride));
+
+ b[16] = vsubq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride));
+ b[17] = vsubq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride));
+ b[18] = vsubq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride));
+ b[19] = vsubq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride));
+ b[20] = vsubq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride));
+ b[21] = vsubq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride));
+ b[22] = vsubq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride));
+ b[23] = vsubq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride));
+}
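load_cross folds the stage 1 butterfly into the load itself. Ignoring the 8-wide vectorization, the per-column arithmetic is just (sketch):

#include <stdint.h>

/* Scalar sketch of the stage 1 cross done by load_cross for one column:
 * b[i] = in[i] + in[31 - i], b[31 - i] = in[i] - in[31 - i]. */
static void stage1_cross_scalar(const int16_t in[32], int16_t b[32]) {
  int i;
  for (i = 0; i < 16; ++i) {
    b[i] = (int16_t)(in[i] + in[31 - i]);
    b[31 - i] = (int16_t)(in[i] - in[31 - i]);
  }
}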
+
+#define STORE_S16(src, index, dest) \
+ do { \
+ store_s16q_to_tran_low(dest, src[index]); \
+ dest += 8; \
+ } while (0)
+
+// Store 32 int16x8_t vectors, assuming stride == 32.
+// Slight twist: store horizontally in blocks of 8.
+static INLINE void store(tran_low_t *a, const int16x8_t *b) {
+ STORE_S16(b, 0, a);
+ STORE_S16(b, 8, a);
+ STORE_S16(b, 16, a);
+ STORE_S16(b, 24, a);
+ STORE_S16(b, 1, a);
+ STORE_S16(b, 9, a);
+ STORE_S16(b, 17, a);
+ STORE_S16(b, 25, a);
+ STORE_S16(b, 2, a);
+ STORE_S16(b, 10, a);
+ STORE_S16(b, 18, a);
+ STORE_S16(b, 26, a);
+ STORE_S16(b, 3, a);
+ STORE_S16(b, 11, a);
+ STORE_S16(b, 19, a);
+ STORE_S16(b, 27, a);
+ STORE_S16(b, 4, a);
+ STORE_S16(b, 12, a);
+ STORE_S16(b, 20, a);
+ STORE_S16(b, 28, a);
+ STORE_S16(b, 5, a);
+ STORE_S16(b, 13, a);
+ STORE_S16(b, 21, a);
+ STORE_S16(b, 29, a);
+ STORE_S16(b, 6, a);
+ STORE_S16(b, 14, a);
+ STORE_S16(b, 22, a);
+ STORE_S16(b, 30, a);
+ STORE_S16(b, 7, a);
+ STORE_S16(b, 15, a);
+ STORE_S16(b, 23, a);
+ STORE_S16(b, 31, a);
+}
+
+#undef STORE_S16
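Each first-pass call produces an 8x32 strip, so the 32 vectors handed to store() carry four 8-wide pieces of each output row; the interleaved STORE_S16 sequence above is equivalent to this index mapping (sketch, with int16_t standing in for tran_low_t):

#include <stdint.h>

/* Scalar sketch of store(): output row r (stride 32) is assembled from the
 * eight lanes of b[r], b[r + 8], b[r + 16] and b[r + 24], in that order. */
static void store_scalar(int16_t *out, const int16_t b[32][8]) {
  int r, blk, lane;
  for (r = 0; r < 8; ++r) {
    for (blk = 0; blk < 4; ++blk) {
      for (lane = 0; lane < 8; ++lane) {
        out[r * 32 + blk * 8 + lane] = b[r + 8 * blk][lane];
      }
    }
  }
}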
+
+static INLINE void scale_input(const int16x8_t *in /*32*/,
+ int16x8_t *out /*32*/) {
+ out[0] = vshlq_n_s16(in[0], 2);
+ out[1] = vshlq_n_s16(in[1], 2);
+ out[2] = vshlq_n_s16(in[2], 2);
+ out[3] = vshlq_n_s16(in[3], 2);
+ out[4] = vshlq_n_s16(in[4], 2);
+ out[5] = vshlq_n_s16(in[5], 2);
+ out[6] = vshlq_n_s16(in[6], 2);
+ out[7] = vshlq_n_s16(in[7], 2);
+
+ out[8] = vshlq_n_s16(in[8], 2);
+ out[9] = vshlq_n_s16(in[9], 2);
+ out[10] = vshlq_n_s16(in[10], 2);
+ out[11] = vshlq_n_s16(in[11], 2);
+ out[12] = vshlq_n_s16(in[12], 2);
+ out[13] = vshlq_n_s16(in[13], 2);
+ out[14] = vshlq_n_s16(in[14], 2);
+ out[15] = vshlq_n_s16(in[15], 2);
+
+ out[16] = vshlq_n_s16(in[16], 2);
+ out[17] = vshlq_n_s16(in[17], 2);
+ out[18] = vshlq_n_s16(in[18], 2);
+ out[19] = vshlq_n_s16(in[19], 2);
+ out[20] = vshlq_n_s16(in[20], 2);
+ out[21] = vshlq_n_s16(in[21], 2);
+ out[22] = vshlq_n_s16(in[22], 2);
+ out[23] = vshlq_n_s16(in[23], 2);
+
+ out[24] = vshlq_n_s16(in[24], 2);
+ out[25] = vshlq_n_s16(in[25], 2);
+ out[26] = vshlq_n_s16(in[26], 2);
+ out[27] = vshlq_n_s16(in[27], 2);
+ out[28] = vshlq_n_s16(in[28], 2);
+ out[29] = vshlq_n_s16(in[29], 2);
+ out[30] = vshlq_n_s16(in[30], 2);
+ out[31] = vshlq_n_s16(in[31], 2);
+}
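scale_input is the x4 (<< 2) pre-scale applied to the first-pass input, matching the C reference's first-pass input scaling; scalar sketch:

#include <stdint.h>

/* Scalar sketch of scale_input: every sample is multiplied by 4 (left shift
 * by 2) before the first pass. */
static void scale_input_scalar(const int16_t *in, int16_t *out, int n) {
  int i;
  for (i = 0; i < n; ++i) out[i] = (int16_t)(in[i] * 4);
}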
+
+static INLINE void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
+ int16x8_t a[32];
+ int16x8_t b[32];
+
+ // Stage 1: Done as part of the load.
+
+ // Stage 2.
+  // Mini cross: butterfly (add/subtract) the first 16 values and the middle
+  // 8 of the second half.
+ a[0] = vaddq_s16(in[0], in[15]);
+ a[1] = vaddq_s16(in[1], in[14]);
+ a[2] = vaddq_s16(in[2], in[13]);
+ a[3] = vaddq_s16(in[3], in[12]);
+ a[4] = vaddq_s16(in[4], in[11]);
+ a[5] = vaddq_s16(in[5], in[10]);
+ a[6] = vaddq_s16(in[6], in[9]);
+ a[7] = vaddq_s16(in[7], in[8]);
+
+ a[8] = vsubq_s16(in[7], in[8]);
+ a[9] = vsubq_s16(in[6], in[9]);
+ a[10] = vsubq_s16(in[5], in[10]);
+ a[11] = vsubq_s16(in[4], in[11]);
+ a[12] = vsubq_s16(in[3], in[12]);
+ a[13] = vsubq_s16(in[2], in[13]);
+ a[14] = vsubq_s16(in[1], in[14]);
+ a[15] = vsubq_s16(in[0], in[15]);
+
+ a[16] = in[16];
+ a[17] = in[17];
+ a[18] = in[18];
+ a[19] = in[19];
+
+ butterfly_one_coeff_s16_s32_narrow(in[27], in[20], cospi_16_64, &a[27],
+ &a[20]);
+ butterfly_one_coeff_s16_s32_narrow(in[26], in[21], cospi_16_64, &a[26],
+ &a[21]);
+ butterfly_one_coeff_s16_s32_narrow(in[25], in[22], cospi_16_64, &a[25],
+ &a[22]);
+ butterfly_one_coeff_s16_s32_narrow(in[24], in[23], cospi_16_64, &a[24],
+ &a[23]);
+
+ a[28] = in[28];
+ a[29] = in[29];
+ a[30] = in[30];
+ a[31] = in[31];
+
+ // Stage 3.
+ b[0] = vaddq_s16(a[0], a[7]);
+ b[1] = vaddq_s16(a[1], a[6]);
+ b[2] = vaddq_s16(a[2], a[5]);
+ b[3] = vaddq_s16(a[3], a[4]);
+
+ b[4] = vsubq_s16(a[3], a[4]);
+ b[5] = vsubq_s16(a[2], a[5]);
+ b[6] = vsubq_s16(a[1], a[6]);
+ b[7] = vsubq_s16(a[0], a[7]);
+
+ b[8] = a[8];
+ b[9] = a[9];
+
+ butterfly_one_coeff_s16_s32_narrow(a[13], a[10], cospi_16_64, &b[13], &b[10]);
+ butterfly_one_coeff_s16_s32_narrow(a[12], a[11], cospi_16_64, &b[12], &b[11]);
+
+ b[14] = a[14];
+ b[15] = a[15];
+
+ b[16] = vaddq_s16(in[16], a[23]);
+ b[17] = vaddq_s16(in[17], a[22]);
+ b[18] = vaddq_s16(in[18], a[21]);
+ b[19] = vaddq_s16(in[19], a[20]);
+
+ b[20] = vsubq_s16(in[19], a[20]);
+ b[21] = vsubq_s16(in[18], a[21]);
+ b[22] = vsubq_s16(in[17], a[22]);
+ b[23] = vsubq_s16(in[16], a[23]);
+
+ b[24] = vsubq_s16(in[31], a[24]);
+ b[25] = vsubq_s16(in[30], a[25]);
+ b[26] = vsubq_s16(in[29], a[26]);
+ b[27] = vsubq_s16(in[28], a[27]);
+
+ b[28] = vaddq_s16(in[28], a[27]);
+ b[29] = vaddq_s16(in[29], a[26]);
+ b[30] = vaddq_s16(in[30], a[25]);
+ b[31] = vaddq_s16(in[31], a[24]);
+
+ // Stage 4.
+ a[0] = vaddq_s16(b[0], b[3]);
+ a[1] = vaddq_s16(b[1], b[2]);
+ a[2] = vsubq_s16(b[1], b[2]);
+ a[3] = vsubq_s16(b[0], b[3]);
+
+ a[4] = b[4];
+
+ butterfly_one_coeff_s16_s32_narrow(b[6], b[5], cospi_16_64, &a[6], &a[5]);
+
+ a[7] = b[7];
+
+ a[8] = vaddq_s16(b[8], b[11]);
+ a[9] = vaddq_s16(b[9], b[10]);
+ a[10] = vsubq_s16(b[9], b[10]);
+ a[11] = vsubq_s16(b[8], b[11]);
+ a[12] = vsubq_s16(b[15], b[12]);
+ a[13] = vsubq_s16(b[14], b[13]);
+ a[14] = vaddq_s16(b[14], b[13]);
+ a[15] = vaddq_s16(b[15], b[12]);
+
+ a[16] = b[16];
+ a[17] = b[17];
+
+ butterfly_two_coeff(b[29], b[18], cospi_8_64, cospi_24_64, &a[29], &a[18]);
+ butterfly_two_coeff(b[28], b[19], cospi_8_64, cospi_24_64, &a[28], &a[19]);
+ butterfly_two_coeff(b[27], b[20], cospi_24_64, -cospi_8_64, &a[27], &a[20]);
+ butterfly_two_coeff(b[26], b[21], cospi_24_64, -cospi_8_64, &a[26], &a[21]);
+
+ a[22] = b[22];
+ a[23] = b[23];
+ a[24] = b[24];
+ a[25] = b[25];
+
+ a[30] = b[30];
+ a[31] = b[31];
+
+ // Stage 5.
+ butterfly_one_coeff_s16_fast(a[0], a[1], cospi_16_64, &b[0], &b[1]);
+ butterfly_two_coeff(a[3], a[2], cospi_8_64, cospi_24_64, &b[2], &b[3]);
+
+ b[4] = vaddq_s16(a[4], a[5]);
+ b[5] = vsubq_s16(a[4], a[5]);
+ b[6] = vsubq_s16(a[7], a[6]);
+ b[7] = vaddq_s16(a[7], a[6]);
+
+ b[8] = a[8];
+
+ butterfly_two_coeff(a[14], a[9], cospi_8_64, cospi_24_64, &b[14], &b[9]);
+ butterfly_two_coeff(a[13], a[10], cospi_24_64, -cospi_8_64, &b[13], &b[10]);
+
+ b[11] = a[11];
+ b[12] = a[12];
+
+ b[15] = a[15];
+
+ b[16] = vaddq_s16(a[19], a[16]);
+ b[17] = vaddq_s16(a[18], a[17]);
+ b[18] = vsubq_s16(a[17], a[18]);
+ b[19] = vsubq_s16(a[16], a[19]);
+ b[20] = vsubq_s16(a[23], a[20]);
+ b[21] = vsubq_s16(a[22], a[21]);
+ b[22] = vaddq_s16(a[21], a[22]);
+ b[23] = vaddq_s16(a[20], a[23]);
+ b[24] = vaddq_s16(a[27], a[24]);
+ b[25] = vaddq_s16(a[26], a[25]);
+ b[26] = vsubq_s16(a[25], a[26]);
+ b[27] = vsubq_s16(a[24], a[27]);
+ b[28] = vsubq_s16(a[31], a[28]);
+ b[29] = vsubq_s16(a[30], a[29]);
+ b[30] = vaddq_s16(a[29], a[30]);
+ b[31] = vaddq_s16(a[28], a[31]);
+
+ // Stage 6.
+ a[0] = b[0];
+ a[1] = b[1];
+ a[2] = b[2];
+ a[3] = b[3];
+
+ butterfly_two_coeff(b[7], b[4], cospi_4_64, cospi_28_64, &a[4], &a[7]);
+ butterfly_two_coeff(b[6], b[5], cospi_20_64, cospi_12_64, &a[5], &a[6]);
+
+ a[8] = vaddq_s16(b[8], b[9]);
+ a[9] = vsubq_s16(b[8], b[9]);
+ a[10] = vsubq_s16(b[11], b[10]);
+ a[11] = vaddq_s16(b[11], b[10]);
+ a[12] = vaddq_s16(b[12], b[13]);
+ a[13] = vsubq_s16(b[12], b[13]);
+ a[14] = vsubq_s16(b[15], b[14]);
+ a[15] = vaddq_s16(b[15], b[14]);
+
+ a[16] = b[16];
+ a[19] = b[19];
+ a[20] = b[20];
+ a[23] = b[23];
+ a[24] = b[24];
+ a[27] = b[27];
+ a[28] = b[28];
+ a[31] = b[31];
+
+ butterfly_two_coeff(b[30], b[17], cospi_4_64, cospi_28_64, &a[30], &a[17]);
+ butterfly_two_coeff(b[29], b[18], cospi_28_64, -cospi_4_64, &a[29], &a[18]);
+
+ butterfly_two_coeff(b[26], b[21], cospi_20_64, cospi_12_64, &a[26], &a[21]);
+ butterfly_two_coeff(b[25], b[22], cospi_12_64, -cospi_20_64, &a[25], &a[22]);
+
+ // Stage 7.
+ b[0] = a[0];
+ b[1] = a[1];
+ b[2] = a[2];
+ b[3] = a[3];
+ b[4] = a[4];
+ b[5] = a[5];
+ b[6] = a[6];
+ b[7] = a[7];
+
+ butterfly_two_coeff(a[15], a[8], cospi_2_64, cospi_30_64, &b[8], &b[15]);
+ butterfly_two_coeff(a[14], a[9], cospi_18_64, cospi_14_64, &b[9], &b[14]);
+ butterfly_two_coeff(a[13], a[10], cospi_10_64, cospi_22_64, &b[10], &b[13]);
+ butterfly_two_coeff(a[12], a[11], cospi_26_64, cospi_6_64, &b[11], &b[12]);
+
+ b[16] = vaddq_s16(a[16], a[17]);
+ b[17] = vsubq_s16(a[16], a[17]);
+ b[18] = vsubq_s16(a[19], a[18]);
+ b[19] = vaddq_s16(a[19], a[18]);
+ b[20] = vaddq_s16(a[20], a[21]);
+ b[21] = vsubq_s16(a[20], a[21]);
+ b[22] = vsubq_s16(a[23], a[22]);
+ b[23] = vaddq_s16(a[23], a[22]);
+ b[24] = vaddq_s16(a[24], a[25]);
+ b[25] = vsubq_s16(a[24], a[25]);
+ b[26] = vsubq_s16(a[27], a[26]);
+ b[27] = vaddq_s16(a[27], a[26]);
+ b[28] = vaddq_s16(a[28], a[29]);
+ b[29] = vsubq_s16(a[28], a[29]);
+ b[30] = vsubq_s16(a[31], a[30]);
+ b[31] = vaddq_s16(a[31], a[30]);
+
+ // Final stage.
+ // Also compute partial rounding shift:
+ // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+ out[0] = sub_round_shift_s16(b[0]);
+ out[16] = sub_round_shift_s16(b[1]);
+ out[8] = sub_round_shift_s16(b[2]);
+ out[24] = sub_round_shift_s16(b[3]);
+ out[4] = sub_round_shift_s16(b[4]);
+ out[20] = sub_round_shift_s16(b[5]);
+ out[12] = sub_round_shift_s16(b[6]);
+ out[28] = sub_round_shift_s16(b[7]);
+ out[2] = sub_round_shift_s16(b[8]);
+ out[18] = sub_round_shift_s16(b[9]);
+ out[10] = sub_round_shift_s16(b[10]);
+ out[26] = sub_round_shift_s16(b[11]);
+ out[6] = sub_round_shift_s16(b[12]);
+ out[22] = sub_round_shift_s16(b[13]);
+ out[14] = sub_round_shift_s16(b[14]);
+ out[30] = sub_round_shift_s16(b[15]);
+
+ butterfly_two_coeff(b[31], b[16], cospi_1_64, cospi_31_64, &a[1], &a[31]);
+ out[1] = sub_round_shift_s16(a[1]);
+ out[31] = sub_round_shift_s16(a[31]);
+
+ butterfly_two_coeff(b[30], b[17], cospi_17_64, cospi_15_64, &a[17], &a[15]);
+ out[17] = sub_round_shift_s16(a[17]);
+ out[15] = sub_round_shift_s16(a[15]);
+
+ butterfly_two_coeff(b[29], b[18], cospi_9_64, cospi_23_64, &a[9], &a[23]);
+ out[9] = sub_round_shift_s16(a[9]);
+ out[23] = sub_round_shift_s16(a[23]);
+
+ butterfly_two_coeff(b[28], b[19], cospi_25_64, cospi_7_64, &a[25], &a[7]);
+ out[25] = sub_round_shift_s16(a[25]);
+ out[7] = sub_round_shift_s16(a[7]);
+
+ butterfly_two_coeff(b[27], b[20], cospi_5_64, cospi_27_64, &a[5], &a[27]);
+ out[5] = sub_round_shift_s16(a[5]);
+ out[27] = sub_round_shift_s16(a[27]);
+
+ butterfly_two_coeff(b[26], b[21], cospi_21_64, cospi_11_64, &a[21], &a[11]);
+ out[21] = sub_round_shift_s16(a[21]);
+ out[11] = sub_round_shift_s16(a[11]);
+
+ butterfly_two_coeff(b[25], b[22], cospi_13_64, cospi_19_64, &a[13], &a[19]);
+ out[13] = sub_round_shift_s16(a[13]);
+ out[19] = sub_round_shift_s16(a[19]);
+
+ butterfly_two_coeff(b[24], b[23], cospi_29_64, cospi_3_64, &a[29], &a[3]);
+ out[29] = sub_round_shift_s16(a[29]);
+ out[3] = sub_round_shift_s16(a[3]);
+}
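The sub_round_shift_s16 calls above implement exactly the first-pass rounding quoted in the final-stage comment; as a scalar formula (sketch):

#include <stdint.h>

/* Scalar sketch of the first-pass rounding: (a + 1 + (a > 0)) >> 2, i.e.
 * add 2 for positive values, 1 otherwise, then arithmetic shift right by 2. */
static int16_t sub_round_shift_scalar(int16_t a) {
  return (int16_t)((a + 1 + (a > 0)) >> 2);
}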
+
+#define PASS_THROUGH(src, dst, element) \
+ do { \
+ dst##_lo[element] = src##_lo[element]; \
+ dst##_hi[element] = src##_hi[element]; \
+ } while (0)
+
+#define ADD_S16_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = \
+ vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
+ b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \
+ vget_high_s16(a[right_index])); \
+ } while (0)
+
+#define SUB_S16_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = \
+ vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
+ b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \
+ vget_high_s16(a[right_index])); \
+ } while (0)
+
+#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \
+ do { \
+ c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \
+ c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \
+ } while (0)
+
+#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \
+ do { \
+ temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \
+ temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \
+ c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \
+ c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \
+ } while (0)
+
+#define ADD_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \
+ b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \
+ } while (0)
+
+#define SUB_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \
+ b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \
+ } while (0)
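The *_S16_S32 macros above split each 8-lane row into low/high halves of four 32-bit lanes before adding or subtracting, so no wrap-around can occur. For example, the low half of ADD_S16_S32 boils down to (sketch):

#include <arm_neon.h>

/* Sketch of the low half of ADD_S16_S32: widen lanes 0-3 of both inputs to
 * 32 bits and add (vaddl_s16); the high half does the same via
 * vget_high_s16. Lane-wise this is c[i] = (int32_t)a[i] + (int32_t)b[i]. */
static inline int32x4_t widening_add_low(const int16x8_t a,
                                         const int16x8_t b) {
  return vaddl_s16(vget_low_s16(a), vget_low_s16(b));
}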
+
+#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \
+ add_index, sub_index) \
+ do { \
+ butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \
+ &b##_lo[add_index], &b##_hi[add_index], \
+ &b##_lo[sub_index], &b##_hi[sub_index]); \
+ } while (0)
+
+#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \
+ sub_index) \
+ do { \
+ butterfly_one_coeff_s32_fast( \
+ a##_lo[left_index], a##_hi[left_index], a##_lo[right_index], \
+ a##_hi[right_index], constant, &b##_lo[add_index], &b##_hi[add_index], \
+ &b##_lo[sub_index], &b##_hi[sub_index]); \
+ } while (0)
+
+#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \
+ right_constant, b, add_index, sub_index) \
+ do { \
+ butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \
+ a##_lo[right_index], a##_hi[right_index], \
+ left_constant, right_constant, &b##_lo[add_index], \
+ &b##_hi[add_index], &b##_lo[sub_index], \
+ &b##_hi[sub_index]); \
+ } while (0)
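The butterfly_* helpers wrapped by these macros live in fdct_neon.h. In scalar terms they are the usual DCT rotation followed by a rounded right shift by DCT_CONST_BITS (14). A sketch of that arithmetic, assuming the (c0, c1) argument order used by the calls in this header; the one-coefficient form is the special case c0 == c1:

#include <stdint.h>

/* Sketch of the butterfly arithmetic (constants are Q14 cosine values). */
#define SKETCH_DCT_CONST_BITS 14

static int32_t fdct_round_shift_sketch(int64_t x) {
  return (int32_t)((x + (1 << (SKETCH_DCT_CONST_BITS - 1))) >>
                   SKETCH_DCT_CONST_BITS);
}

/* add = round(a * c0 + b * c1), sub = round(a * c1 - b * c0). */
static void butterfly_two_coeff_sketch(int32_t a, int32_t b, int32_t c0,
                                       int32_t c1, int32_t *add,
                                       int32_t *sub) {
  *add = fdct_round_shift_sketch((int64_t)a * c0 + (int64_t)b * c1);
  *sub = fdct_round_shift_sketch((int64_t)a * c1 - (int64_t)b * c0);
}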
+
+static INLINE void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
+ int16x8_t a[32];
+ int16x8_t b[32];
+ int32x4_t c_lo[32];
+ int32x4_t c_hi[32];
+ int32x4_t d_lo[32];
+ int32x4_t d_hi[32];
+
+ // Stage 1. Done as part of the load for the first pass.
+ a[0] = vaddq_s16(in[0], in[31]);
+ a[1] = vaddq_s16(in[1], in[30]);
+ a[2] = vaddq_s16(in[2], in[29]);
+ a[3] = vaddq_s16(in[3], in[28]);
+ a[4] = vaddq_s16(in[4], in[27]);
+ a[5] = vaddq_s16(in[5], in[26]);
+ a[6] = vaddq_s16(in[6], in[25]);
+ a[7] = vaddq_s16(in[7], in[24]);
+ a[8] = vaddq_s16(in[8], in[23]);
+ a[9] = vaddq_s16(in[9], in[22]);
+ a[10] = vaddq_s16(in[10], in[21]);
+ a[11] = vaddq_s16(in[11], in[20]);
+ a[12] = vaddq_s16(in[12], in[19]);
+ a[13] = vaddq_s16(in[13], in[18]);
+ a[14] = vaddq_s16(in[14], in[17]);
+ a[15] = vaddq_s16(in[15], in[16]);
+ a[16] = vsubq_s16(in[15], in[16]);
+ a[17] = vsubq_s16(in[14], in[17]);
+ a[18] = vsubq_s16(in[13], in[18]);
+ a[19] = vsubq_s16(in[12], in[19]);
+ a[20] = vsubq_s16(in[11], in[20]);
+ a[21] = vsubq_s16(in[10], in[21]);
+ a[22] = vsubq_s16(in[9], in[22]);
+ a[23] = vsubq_s16(in[8], in[23]);
+ a[24] = vsubq_s16(in[7], in[24]);
+ a[25] = vsubq_s16(in[6], in[25]);
+ a[26] = vsubq_s16(in[5], in[26]);
+ a[27] = vsubq_s16(in[4], in[27]);
+ a[28] = vsubq_s16(in[3], in[28]);
+ a[29] = vsubq_s16(in[2], in[29]);
+ a[30] = vsubq_s16(in[1], in[30]);
+ a[31] = vsubq_s16(in[0], in[31]);
+
+ // Stage 2.
+ b[0] = vaddq_s16(a[0], a[15]);
+ b[1] = vaddq_s16(a[1], a[14]);
+ b[2] = vaddq_s16(a[2], a[13]);
+ b[3] = vaddq_s16(a[3], a[12]);
+ b[4] = vaddq_s16(a[4], a[11]);
+ b[5] = vaddq_s16(a[5], a[10]);
+ b[6] = vaddq_s16(a[6], a[9]);
+ b[7] = vaddq_s16(a[7], a[8]);
+
+ b[8] = vsubq_s16(a[7], a[8]);
+ b[9] = vsubq_s16(a[6], a[9]);
+ b[10] = vsubq_s16(a[5], a[10]);
+ b[11] = vsubq_s16(a[4], a[11]);
+ b[12] = vsubq_s16(a[3], a[12]);
+ b[13] = vsubq_s16(a[2], a[13]);
+ b[14] = vsubq_s16(a[1], a[14]);
+ b[15] = vsubq_s16(a[0], a[15]);
+
+ b[16] = a[16];
+ b[17] = a[17];
+ b[18] = a[18];
+ b[19] = a[19];
+
+ butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]);
+ butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]);
+ butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]);
+ butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]);
+
+ b[28] = a[28];
+ b[29] = a[29];
+ b[30] = a[30];
+ b[31] = a[31];
+
+  // Stage 3. With extreme input values this calculation overflows int16_t.
+  // The sources feeding b[0] are added multiple times and, through testing,
+  // have been shown to overflow starting here.
+ ADD_S16_S32(b, 0, 7, c, 0);
+ ADD_S16_S32(b, 1, 6, c, 1);
+ ADD_S16_S32(b, 2, 5, c, 2);
+ ADD_S16_S32(b, 3, 4, c, 3);
+ SUB_S16_S32(b, 3, 4, c, 4);
+ SUB_S16_S32(b, 2, 5, c, 5);
+ SUB_S16_S32(b, 1, 6, c, 6);
+ SUB_S16_S32(b, 0, 7, c, 7);
+
+ a[8] = b[8];
+ a[9] = b[9];
+
+ BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10);
+ BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11);
+
+ a[14] = b[14];
+ a[15] = b[15];
+
+ ADD_S16_S32(b, 16, 23, c, 16);
+ ADD_S16_S32(b, 17, 22, c, 17);
+ ADD_S16_S32(b, 18, 21, c, 18);
+ ADD_S16_S32(b, 19, 20, c, 19);
+ SUB_S16_S32(b, 19, 20, c, 20);
+ SUB_S16_S32(b, 18, 21, c, 21);
+ SUB_S16_S32(b, 17, 22, c, 22);
+ SUB_S16_S32(b, 16, 23, c, 23);
+ SUB_S16_S32(b, 31, 24, c, 24);
+ SUB_S16_S32(b, 30, 25, c, 25);
+ SUB_S16_S32(b, 29, 26, c, 26);
+ SUB_S16_S32(b, 28, 27, c, 27);
+ ADD_S16_S32(b, 28, 27, c, 28);
+ ADD_S16_S32(b, 29, 26, c, 29);
+ ADD_S16_S32(b, 30, 25, c, 30);
+ ADD_S16_S32(b, 31, 24, c, 31);
+
+ // Stage 4.
+ ADD_S32(c, 0, 3, d, 0);
+ ADD_S32(c, 1, 2, d, 1);
+ SUB_S32(c, 1, 2, d, 2);
+ SUB_S32(c, 0, 3, d, 3);
+
+ PASS_THROUGH(c, d, 4);
+
+ BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5);
+
+ PASS_THROUGH(c, d, 7);
+
+ ADDW_S16_S32(c, 11, a, 8, d, 8);
+ ADDW_S16_S32(c, 10, a, 9, d, 9);
+ SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10);
+ SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11);
+ SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12);
+ SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13);
+ ADDW_S16_S32(c, 13, b, 14, d, 14);
+ ADDW_S16_S32(c, 12, b, 15, d, 15);
+
+ PASS_THROUGH(c, d, 16);
+ PASS_THROUGH(c, d, 17);
+
+ BUTTERFLY_TWO_S32(c, 29, 18, cospi_8_64, cospi_24_64, d, 29, 18);
+ BUTTERFLY_TWO_S32(c, 28, 19, cospi_8_64, cospi_24_64, d, 28, 19);
+ BUTTERFLY_TWO_S32(c, 27, 20, cospi_24_64, -cospi_8_64, d, 27, 20);
+ BUTTERFLY_TWO_S32(c, 26, 21, cospi_24_64, -cospi_8_64, d, 26, 21);
+
+ PASS_THROUGH(c, d, 22);
+ PASS_THROUGH(c, d, 23);
+ PASS_THROUGH(c, d, 24);
+ PASS_THROUGH(c, d, 25);
+
+ PASS_THROUGH(c, d, 30);
+ PASS_THROUGH(c, d, 31);
+
+ // Stage 5.
+ BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1);
+ BUTTERFLY_TWO_S32(d, 3, 2, cospi_8_64, cospi_24_64, c, 2, 3);
+
+ ADD_S32(d, 4, 5, c, 4);
+ SUB_S32(d, 4, 5, c, 5);
+ SUB_S32(d, 7, 6, c, 6);
+ ADD_S32(d, 7, 6, c, 7);
+
+ PASS_THROUGH(d, c, 8);
+
+ BUTTERFLY_TWO_S32(d, 14, 9, cospi_8_64, cospi_24_64, c, 14, 9);
+ BUTTERFLY_TWO_S32(d, 13, 10, cospi_24_64, -cospi_8_64, c, 13, 10);
+
+ PASS_THROUGH(d, c, 11);
+ PASS_THROUGH(d, c, 12);
+ PASS_THROUGH(d, c, 15);
+
+ ADD_S32(d, 16, 19, c, 16);
+ ADD_S32(d, 17, 18, c, 17);
+ SUB_S32(d, 17, 18, c, 18);
+ SUB_S32(d, 16, 19, c, 19);
+ SUB_S32(d, 23, 20, c, 20);
+ SUB_S32(d, 22, 21, c, 21);
+ ADD_S32(d, 22, 21, c, 22);
+ ADD_S32(d, 23, 20, c, 23);
+ ADD_S32(d, 24, 27, c, 24);
+ ADD_S32(d, 25, 26, c, 25);
+ SUB_S32(d, 25, 26, c, 26);
+ SUB_S32(d, 24, 27, c, 27);
+ SUB_S32(d, 31, 28, c, 28);
+ SUB_S32(d, 30, 29, c, 29);
+ ADD_S32(d, 30, 29, c, 30);
+ ADD_S32(d, 31, 28, c, 31);
+
+ // Stage 6.
+ PASS_THROUGH(c, d, 0);
+ PASS_THROUGH(c, d, 1);
+ PASS_THROUGH(c, d, 2);
+ PASS_THROUGH(c, d, 3);
+
+ BUTTERFLY_TWO_S32(c, 7, 4, cospi_4_64, cospi_28_64, d, 4, 7);
+ BUTTERFLY_TWO_S32(c, 6, 5, cospi_20_64, cospi_12_64, d, 5, 6);
+
+ ADD_S32(c, 8, 9, d, 8);
+ SUB_S32(c, 8, 9, d, 9);
+ SUB_S32(c, 11, 10, d, 10);
+ ADD_S32(c, 11, 10, d, 11);
+ ADD_S32(c, 12, 13, d, 12);
+ SUB_S32(c, 12, 13, d, 13);
+ SUB_S32(c, 15, 14, d, 14);
+ ADD_S32(c, 15, 14, d, 15);
+
+ PASS_THROUGH(c, d, 16);
+ PASS_THROUGH(c, d, 19);
+ PASS_THROUGH(c, d, 20);
+ PASS_THROUGH(c, d, 23);
+ PASS_THROUGH(c, d, 24);
+ PASS_THROUGH(c, d, 27);
+ PASS_THROUGH(c, d, 28);
+ PASS_THROUGH(c, d, 31);
+
+ BUTTERFLY_TWO_S32(c, 30, 17, cospi_4_64, cospi_28_64, d, 30, 17);
+ BUTTERFLY_TWO_S32(c, 29, 18, cospi_28_64, -cospi_4_64, d, 29, 18);
+ BUTTERFLY_TWO_S32(c, 26, 21, cospi_20_64, cospi_12_64, d, 26, 21);
+ BUTTERFLY_TWO_S32(c, 25, 22, cospi_12_64, -cospi_20_64, d, 25, 22);
+
+ // Stage 7.
+ PASS_THROUGH(d, c, 0);
+ PASS_THROUGH(d, c, 1);
+ PASS_THROUGH(d, c, 2);
+ PASS_THROUGH(d, c, 3);
+ PASS_THROUGH(d, c, 4);
+ PASS_THROUGH(d, c, 5);
+ PASS_THROUGH(d, c, 6);
+ PASS_THROUGH(d, c, 7);
+
+ BUTTERFLY_TWO_S32(d, 15, 8, cospi_2_64, cospi_30_64, c, 8, 15);
+ BUTTERFLY_TWO_S32(d, 14, 9, cospi_18_64, cospi_14_64, c, 9, 14);
+ BUTTERFLY_TWO_S32(d, 13, 10, cospi_10_64, cospi_22_64, c, 10, 13);
+ BUTTERFLY_TWO_S32(d, 12, 11, cospi_26_64, cospi_6_64, c, 11, 12);
+
+ ADD_S32(d, 16, 17, c, 16);
+ SUB_S32(d, 16, 17, c, 17);
+ SUB_S32(d, 19, 18, c, 18);
+ ADD_S32(d, 19, 18, c, 19);
+ ADD_S32(d, 20, 21, c, 20);
+ SUB_S32(d, 20, 21, c, 21);
+ SUB_S32(d, 23, 22, c, 22);
+ ADD_S32(d, 23, 22, c, 23);
+ ADD_S32(d, 24, 25, c, 24);
+ SUB_S32(d, 24, 25, c, 25);
+ SUB_S32(d, 27, 26, c, 26);
+ ADD_S32(d, 27, 26, c, 27);
+ ADD_S32(d, 28, 29, c, 28);
+ SUB_S32(d, 28, 29, c, 29);
+ SUB_S32(d, 31, 30, c, 30);
+ ADD_S32(d, 31, 30, c, 31);
+
+ // Final stage.
+ // Roll the rounding into this function so the results can be passed back as
+ // int16x8_t.
+
+ out[0] = add_round_shift_s32_narrow(c_lo[0], c_hi[0]);
+ out[16] = add_round_shift_s32_narrow(c_lo[1], c_hi[1]);
+
+ out[8] = add_round_shift_s32_narrow(c_lo[2], c_hi[2]);
+ out[24] = add_round_shift_s32_narrow(c_lo[3], c_hi[3]);
+ out[4] = add_round_shift_s32_narrow(c_lo[4], c_hi[4]);
+ out[20] = add_round_shift_s32_narrow(c_lo[5], c_hi[5]);
+ out[12] = add_round_shift_s32_narrow(c_lo[6], c_hi[6]);
+
+ out[28] = add_round_shift_s32_narrow(c_lo[7], c_hi[7]);
+ out[2] = add_round_shift_s32_narrow(c_lo[8], c_hi[8]);
+ out[18] = add_round_shift_s32_narrow(c_lo[9], c_hi[9]);
+ out[10] = add_round_shift_s32_narrow(c_lo[10], c_hi[10]);
+
+ out[26] = add_round_shift_s32_narrow(c_lo[11], c_hi[11]);
+ out[6] = add_round_shift_s32_narrow(c_lo[12], c_hi[12]);
+ out[22] = add_round_shift_s32_narrow(c_lo[13], c_hi[13]);
+ out[14] = add_round_shift_s32_narrow(c_lo[14], c_hi[14]);
+ out[30] = add_round_shift_s32_narrow(c_lo[15], c_hi[15]);
+
+ BUTTERFLY_TWO_S32(c, 31, 16, cospi_1_64, cospi_31_64, d, 1, 31);
+ out[1] = add_round_shift_s32_narrow(d_lo[1], d_hi[1]);
+ out[31] = add_round_shift_s32_narrow(d_lo[31], d_hi[31]);
+
+ BUTTERFLY_TWO_S32(c, 30, 17, cospi_17_64, cospi_15_64, d, 17, 15);
+ out[17] = add_round_shift_s32_narrow(d_lo[17], d_hi[17]);
+ out[15] = add_round_shift_s32_narrow(d_lo[15], d_hi[15]);
+
+ BUTTERFLY_TWO_S32(c, 29, 18, cospi_9_64, cospi_23_64, d, 9, 23);
+ out[9] = add_round_shift_s32_narrow(d_lo[9], d_hi[9]);
+ out[23] = add_round_shift_s32_narrow(d_lo[23], d_hi[23]);
+
+ BUTTERFLY_TWO_S32(c, 28, 19, cospi_25_64, cospi_7_64, d, 25, 7);
+ out[25] = add_round_shift_s32_narrow(d_lo[25], d_hi[25]);
+ out[7] = add_round_shift_s32_narrow(d_lo[7], d_hi[7]);
+
+ BUTTERFLY_TWO_S32(c, 27, 20, cospi_5_64, cospi_27_64, d, 5, 27);
+ out[5] = add_round_shift_s32_narrow(d_lo[5], d_hi[5]);
+ out[27] = add_round_shift_s32_narrow(d_lo[27], d_hi[27]);
+
+ BUTTERFLY_TWO_S32(c, 26, 21, cospi_21_64, cospi_11_64, d, 21, 11);
+ out[21] = add_round_shift_s32_narrow(d_lo[21], d_hi[21]);
+ out[11] = add_round_shift_s32_narrow(d_lo[11], d_hi[11]);
+
+ BUTTERFLY_TWO_S32(c, 25, 22, cospi_13_64, cospi_19_64, d, 13, 19);
+ out[13] = add_round_shift_s32_narrow(d_lo[13], d_hi[13]);
+ out[19] = add_round_shift_s32_narrow(d_lo[19], d_hi[19]);
+
+ BUTTERFLY_TWO_S32(c, 24, 23, cospi_29_64, cospi_3_64, d, 29, 3);
+ out[29] = add_round_shift_s32_narrow(d_lo[29], d_hi[29]);
+ out[3] = add_round_shift_s32_narrow(d_lo[3], d_hi[3]);
+}
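+ 
+// For reference only (hypothetical helper, unused by this file): the even
+// outputs written by the final stage above land at 5-bit bit-reversed
+// positions; this sketch simply records the destinations read off the stores
+// above (out[0], out[16], out[8], out[24], ...). The odd outputs are produced
+// in pairs by the trailing two-coefficient butterflies.
+static INLINE int fdct32_even_output_index(int k /* 0..15 */) {
+ const int order[16] = { 0, 16, 8,  24, 4, 20, 12, 28,
+                         2, 18, 10, 26, 6, 22, 14, 30 };
+ return order[k];
+}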
+
+static INLINE void dct_body_second_pass_rd(const int16x8_t *in,
+ int16x8_t *out) {
+ int16x8_t a[32];
+ int16x8_t b[32];
+
+ // Stage 1. Done as part of the load for the first pass.
+ a[0] = vaddq_s16(in[0], in[31]);
+ a[1] = vaddq_s16(in[1], in[30]);
+ a[2] = vaddq_s16(in[2], in[29]);
+ a[3] = vaddq_s16(in[3], in[28]);
+ a[4] = vaddq_s16(in[4], in[27]);
+ a[5] = vaddq_s16(in[5], in[26]);
+ a[6] = vaddq_s16(in[6], in[25]);
+ a[7] = vaddq_s16(in[7], in[24]);
+ a[8] = vaddq_s16(in[8], in[23]);
+ a[9] = vaddq_s16(in[9], in[22]);
+ a[10] = vaddq_s16(in[10], in[21]);
+ a[11] = vaddq_s16(in[11], in[20]);
+ a[12] = vaddq_s16(in[12], in[19]);
+ a[13] = vaddq_s16(in[13], in[18]);
+ a[14] = vaddq_s16(in[14], in[17]);
+ a[15] = vaddq_s16(in[15], in[16]);
+ a[16] = vsubq_s16(in[15], in[16]);
+ a[17] = vsubq_s16(in[14], in[17]);
+ a[18] = vsubq_s16(in[13], in[18]);
+ a[19] = vsubq_s16(in[12], in[19]);
+ a[20] = vsubq_s16(in[11], in[20]);
+ a[21] = vsubq_s16(in[10], in[21]);
+ a[22] = vsubq_s16(in[9], in[22]);
+ a[23] = vsubq_s16(in[8], in[23]);
+ a[24] = vsubq_s16(in[7], in[24]);
+ a[25] = vsubq_s16(in[6], in[25]);
+ a[26] = vsubq_s16(in[5], in[26]);
+ a[27] = vsubq_s16(in[4], in[27]);
+ a[28] = vsubq_s16(in[3], in[28]);
+ a[29] = vsubq_s16(in[2], in[29]);
+ a[30] = vsubq_s16(in[1], in[30]);
+ a[31] = vsubq_s16(in[0], in[31]);
+
+ // Stage 2.
+ // For the "rd" version, all the values are rounded down after stage 2 to keep
+ // the values in 16 bits.
+ b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15]));
+ b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14]));
+ b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13]));
+ b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12]));
+ b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11]));
+ b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10]));
+ b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9]));
+ b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8]));
+
+ b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8]));
+ b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9]));
+ b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10]));
+ b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11]));
+ b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12]));
+ b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13]));
+ b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14]));
+ b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15]));
+
+ b[16] = add_round_shift_s16(a[16]);
+ b[17] = add_round_shift_s16(a[17]);
+ b[18] = add_round_shift_s16(a[18]);
+ b[19] = add_round_shift_s16(a[19]);
+
+ butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]);
+ butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]);
+ butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]);
+ butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]);
+ b[20] = add_round_shift_s16(b[20]);
+ b[21] = add_round_shift_s16(b[21]);
+ b[22] = add_round_shift_s16(b[22]);
+ b[23] = add_round_shift_s16(b[23]);
+ b[24] = add_round_shift_s16(b[24]);
+ b[25] = add_round_shift_s16(b[25]);
+ b[26] = add_round_shift_s16(b[26]);
+ b[27] = add_round_shift_s16(b[27]);
+
+ b[28] = add_round_shift_s16(a[28]);
+ b[29] = add_round_shift_s16(a[29]);
+ b[30] = add_round_shift_s16(a[30]);
+ b[31] = add_round_shift_s16(a[31]);
+
+ // Stage 3.
+ a[0] = vaddq_s16(b[0], b[7]);
+ a[1] = vaddq_s16(b[1], b[6]);
+ a[2] = vaddq_s16(b[2], b[5]);
+ a[3] = vaddq_s16(b[3], b[4]);
+
+ a[4] = vsubq_s16(b[3], b[4]);
+ a[5] = vsubq_s16(b[2], b[5]);
+ a[6] = vsubq_s16(b[1], b[6]);
+ a[7] = vsubq_s16(b[0], b[7]);
+
+ a[8] = b[8];
+ a[9] = b[9];
+
+ butterfly_one_coeff_s16_s32_narrow(b[13], b[10], cospi_16_64, &a[13], &a[10]);
+ butterfly_one_coeff_s16_s32_narrow(b[12], b[11], cospi_16_64, &a[12], &a[11]);
+
+ a[14] = b[14];
+ a[15] = b[15];
+
+ a[16] = vaddq_s16(b[16], b[23]);
+ a[17] = vaddq_s16(b[17], b[22]);
+ a[18] = vaddq_s16(b[18], b[21]);
+ a[19] = vaddq_s16(b[19], b[20]);
+
+ a[20] = vsubq_s16(b[19], b[20]);
+ a[21] = vsubq_s16(b[18], b[21]);
+ a[22] = vsubq_s16(b[17], b[22]);
+ a[23] = vsubq_s16(b[16], b[23]);
+
+ a[24] = vsubq_s16(b[31], b[24]);
+ a[25] = vsubq_s16(b[30], b[25]);
+ a[26] = vsubq_s16(b[29], b[26]);
+ a[27] = vsubq_s16(b[28], b[27]);
+
+ a[28] = vaddq_s16(b[28], b[27]);
+ a[29] = vaddq_s16(b[29], b[26]);
+ a[30] = vaddq_s16(b[30], b[25]);
+ a[31] = vaddq_s16(b[31], b[24]);
+
+ // Stage 4.
+ b[0] = vaddq_s16(a[0], a[3]);
+ b[1] = vaddq_s16(a[1], a[2]);
+ b[2] = vsubq_s16(a[1], a[2]);
+ b[3] = vsubq_s16(a[0], a[3]);
+
+ b[4] = a[4];
+
+ butterfly_one_coeff_s16_s32_narrow(a[6], a[5], cospi_16_64, &b[6], &b[5]);
+
+ b[7] = a[7];
+
+ b[8] = vaddq_s16(a[8], a[11]);
+ b[9] = vaddq_s16(a[9], a[10]);
+ b[10] = vsubq_s16(a[9], a[10]);
+ b[11] = vsubq_s16(a[8], a[11]);
+ b[12] = vsubq_s16(a[15], a[12]);
+ b[13] = vsubq_s16(a[14], a[13]);
+ b[14] = vaddq_s16(a[14], a[13]);
+ b[15] = vaddq_s16(a[15], a[12]);
+
+ b[16] = a[16];
+ b[17] = a[17];
+
+ butterfly_two_coeff(a[29], a[18], cospi_8_64, cospi_24_64, &b[29], &b[18]);
+ butterfly_two_coeff(a[28], a[19], cospi_8_64, cospi_24_64, &b[28], &b[19]);
+ butterfly_two_coeff(a[27], a[20], cospi_24_64, -cospi_8_64, &b[27], &b[20]);
+ butterfly_two_coeff(a[26], a[21], cospi_24_64, -cospi_8_64, &b[26], &b[21]);
+
+ b[22] = a[22];
+ b[23] = a[23];
+ b[24] = a[24];
+ b[25] = a[25];
+
+ b[30] = a[30];
+ b[31] = a[31];
+
+ // Stage 5.
+ butterfly_one_coeff_s16_s32_narrow(b[0], b[1], cospi_16_64, &a[0], &a[1]);
+ butterfly_two_coeff(b[3], b[2], cospi_8_64, cospi_24_64, &a[2], &a[3]);
+
+ a[4] = vaddq_s16(b[4], b[5]);
+ a[5] = vsubq_s16(b[4], b[5]);
+ a[6] = vsubq_s16(b[7], b[6]);
+ a[7] = vaddq_s16(b[7], b[6]);
+
+ a[8] = b[8];
+
+ butterfly_two_coeff(b[14], b[9], cospi_8_64, cospi_24_64, &a[14], &a[9]);
+ butterfly_two_coeff(b[13], b[10], cospi_24_64, -cospi_8_64, &a[13], &a[10]);
+
+ a[11] = b[11];
+ a[12] = b[12];
+
+ a[15] = b[15];
+
+ a[16] = vaddq_s16(b[19], b[16]);
+ a[17] = vaddq_s16(b[18], b[17]);
+ a[18] = vsubq_s16(b[17], b[18]);
+ a[19] = vsubq_s16(b[16], b[19]);
+ a[20] = vsubq_s16(b[23], b[20]);
+ a[21] = vsubq_s16(b[22], b[21]);
+ a[22] = vaddq_s16(b[21], b[22]);
+ a[23] = vaddq_s16(b[20], b[23]);
+ a[24] = vaddq_s16(b[27], b[24]);
+ a[25] = vaddq_s16(b[26], b[25]);
+ a[26] = vsubq_s16(b[25], b[26]);
+ a[27] = vsubq_s16(b[24], b[27]);
+ a[28] = vsubq_s16(b[31], b[28]);
+ a[29] = vsubq_s16(b[30], b[29]);
+ a[30] = vaddq_s16(b[29], b[30]);
+ a[31] = vaddq_s16(b[28], b[31]);
+
+ // Stage 6.
+ b[0] = a[0];
+ b[1] = a[1];
+ b[2] = a[2];
+ b[3] = a[3];
+
+ butterfly_two_coeff(a[7], a[4], cospi_4_64, cospi_28_64, &b[4], &b[7]);
+ butterfly_two_coeff(a[6], a[5], cospi_20_64, cospi_12_64, &b[5], &b[6]);
+
+ b[8] = vaddq_s16(a[8], a[9]);
+ b[9] = vsubq_s16(a[8], a[9]);
+ b[10] = vsubq_s16(a[11], a[10]);
+ b[11] = vaddq_s16(a[11], a[10]);
+ b[12] = vaddq_s16(a[12], a[13]);
+ b[13] = vsubq_s16(a[12], a[13]);
+ b[14] = vsubq_s16(a[15], a[14]);
+ b[15] = vaddq_s16(a[15], a[14]);
+
+ b[16] = a[16];
+ b[19] = a[19];
+ b[20] = a[20];
+ b[23] = a[23];
+ b[24] = a[24];
+ b[27] = a[27];
+ b[28] = a[28];
+ b[31] = a[31];
+
+ butterfly_two_coeff(a[30], a[17], cospi_4_64, cospi_28_64, &b[30], &b[17]);
+ butterfly_two_coeff(a[29], a[18], cospi_28_64, -cospi_4_64, &b[29], &b[18]);
+
+ butterfly_two_coeff(a[26], a[21], cospi_20_64, cospi_12_64, &b[26], &b[21]);
+ butterfly_two_coeff(a[25], a[22], cospi_12_64, -cospi_20_64, &b[25], &b[22]);
+
+ // Stage 7.
+ a[0] = b[0];
+ a[1] = b[1];
+ a[2] = b[2];
+ a[3] = b[3];
+ a[4] = b[4];
+ a[5] = b[5];
+ a[6] = b[6];
+ a[7] = b[7];
+
+ butterfly_two_coeff(b[15], b[8], cospi_2_64, cospi_30_64, &a[8], &a[15]);
+ butterfly_two_coeff(b[14], b[9], cospi_18_64, cospi_14_64, &a[9], &a[14]);
+ butterfly_two_coeff(b[13], b[10], cospi_10_64, cospi_22_64, &a[10], &a[13]);
+ butterfly_two_coeff(b[12], b[11], cospi_26_64, cospi_6_64, &a[11], &a[12]);
+
+ a[16] = vaddq_s16(b[16], b[17]);
+ a[17] = vsubq_s16(b[16], b[17]);
+ a[18] = vsubq_s16(b[19], b[18]);
+ a[19] = vaddq_s16(b[19], b[18]);
+ a[20] = vaddq_s16(b[20], b[21]);
+ a[21] = vsubq_s16(b[20], b[21]);
+ a[22] = vsubq_s16(b[23], b[22]);
+ a[23] = vaddq_s16(b[23], b[22]);
+ a[24] = vaddq_s16(b[24], b[25]);
+ a[25] = vsubq_s16(b[24], b[25]);
+ a[26] = vsubq_s16(b[27], b[26]);
+ a[27] = vaddq_s16(b[27], b[26]);
+ a[28] = vaddq_s16(b[28], b[29]);
+ a[29] = vsubq_s16(b[28], b[29]);
+ a[30] = vsubq_s16(b[31], b[30]);
+ a[31] = vaddq_s16(b[31], b[30]);
+
+ // Final stage.
+ out[0] = a[0];
+ out[16] = a[1];
+ out[8] = a[2];
+ out[24] = a[3];
+ out[4] = a[4];
+ out[20] = a[5];
+ out[12] = a[6];
+ out[28] = a[7];
+ out[2] = a[8];
+ out[18] = a[9];
+ out[10] = a[10];
+ out[26] = a[11];
+ out[6] = a[12];
+ out[22] = a[13];
+ out[14] = a[14];
+ out[30] = a[15];
+
+ butterfly_two_coeff(a[31], a[16], cospi_1_64, cospi_31_64, &out[1], &out[31]);
+ butterfly_two_coeff(a[30], a[17], cospi_17_64, cospi_15_64, &out[17],
+ &out[15]);
+ butterfly_two_coeff(a[29], a[18], cospi_9_64, cospi_23_64, &out[9], &out[23]);
+ butterfly_two_coeff(a[28], a[19], cospi_25_64, cospi_7_64, &out[25], &out[7]);
+ butterfly_two_coeff(a[27], a[20], cospi_5_64, cospi_27_64, &out[5], &out[27]);
+ butterfly_two_coeff(a[26], a[21], cospi_21_64, cospi_11_64, &out[21],
+ &out[11]);
+ butterfly_two_coeff(a[25], a[22], cospi_13_64, cospi_19_64, &out[13],
+ &out[19]);
+ butterfly_two_coeff(a[24], a[23], cospi_29_64, cospi_3_64, &out[29], &out[3]);
+}
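+ 
+// Illustrative scalar model of the two-coefficient butterflies used throughout
+// the bodies above (hypothetical helper, unused). It assumes butterfly_two_coeff()
+// and its widening variants keep the scalar reference contract:
+// add = round(a * c0 + b * c1), sub = round(a * c1 - b * c0), where round() is
+// the usual fdct rounding shift by DCT_CONST_BITS.
+static INLINE void butterfly_two_coeff_model(int32_t a, int32_t b, int32_t c0,
+                                             int32_t c1, int32_t *add,
+                                             int32_t *sub) {
+ const int64_t sum = (int64_t)a * c0 + (int64_t)b * c1;
+ const int64_t diff = (int64_t)a * c1 - (int64_t)b * c0;
+ *add = (int32_t)((sum + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
+ *sub = (int32_t)((diff + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
+}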
+
+#undef PASS_THROUGH
+#undef ADD_S16_S32
+#undef SUB_S16_S32
+#undef ADDW_S16_S32
+#undef SUBW_S16_S32
+#undef ADD_S32
+#undef SUB_S32
+#undef BUTTERFLY_ONE_S16_S32
+#undef BUTTERFLY_ONE_S32
+#undef BUTTERFLY_TWO_S32
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+// Store a 32x32 block of int32 coefficients as 32 rows of eight 4-lane
+// vectors, assuming stride == 32.
+static INLINE void store32x32_s32(
+ tran_low_t *a, const int32x4_t *l1 /*[32]*/, const int32x4_t *r1 /*[32]*/,
+ const int32x4_t *l2 /*[32]*/, const int32x4_t *r2 /*[32]*/,
+ const int32x4_t *l3 /*[32]*/, const int32x4_t *r3 /*[32]*/,
+ const int32x4_t *l4 /*[32]*/, const int32x4_t *r4 /*[32]*/) {
+ int i;
+ for (i = 0; i < 32; i++) {
+ vst1q_s32(a, l1[i]);
+ vst1q_s32(a + 4, r1[i]);
+ vst1q_s32(a + 8, l2[i]);
+ vst1q_s32(a + 12, r2[i]);
+ vst1q_s32(a + 16, l3[i]);
+ vst1q_s32(a + 20, r3[i]);
+ vst1q_s32(a + 24, l4[i]);
+ vst1q_s32(a + 28, r4[i]);
+ a += 32;
+ }
+}
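+ 
+// Layout note: each output row i above is assembled from four left/right
+// vector pairs of four lanes each, i.e. columns [0,3] = l1[i], [4,7] = r1[i],
+// [8,11] = l2[i], [12,15] = r2[i], [16,19] = l3[i], [20,23] = r3[i],
+// [24,27] = l4[i], [28,31] = r4[i].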
+
+static INLINE void highbd_scale_input(const int16x8_t *a /*[32]*/,
+ int32x4_t *left /*[32]*/,
+ int32x4_t *right /* [32] */) {
+ left[0] = vshll_n_s16(vget_low_s16(a[0]), 2);
+ left[1] = vshll_n_s16(vget_low_s16(a[1]), 2);
+ left[2] = vshll_n_s16(vget_low_s16(a[2]), 2);
+ left[3] = vshll_n_s16(vget_low_s16(a[3]), 2);
+ left[4] = vshll_n_s16(vget_low_s16(a[4]), 2);
+ left[5] = vshll_n_s16(vget_low_s16(a[5]), 2);
+ left[6] = vshll_n_s16(vget_low_s16(a[6]), 2);
+ left[7] = vshll_n_s16(vget_low_s16(a[7]), 2);
+ left[8] = vshll_n_s16(vget_low_s16(a[8]), 2);
+ left[9] = vshll_n_s16(vget_low_s16(a[9]), 2);
+ left[10] = vshll_n_s16(vget_low_s16(a[10]), 2);
+ left[11] = vshll_n_s16(vget_low_s16(a[11]), 2);
+ left[12] = vshll_n_s16(vget_low_s16(a[12]), 2);
+ left[13] = vshll_n_s16(vget_low_s16(a[13]), 2);
+ left[14] = vshll_n_s16(vget_low_s16(a[14]), 2);
+ left[15] = vshll_n_s16(vget_low_s16(a[15]), 2);
+ left[16] = vshll_n_s16(vget_low_s16(a[16]), 2);
+ left[17] = vshll_n_s16(vget_low_s16(a[17]), 2);
+ left[18] = vshll_n_s16(vget_low_s16(a[18]), 2);
+ left[19] = vshll_n_s16(vget_low_s16(a[19]), 2);
+ left[20] = vshll_n_s16(vget_low_s16(a[20]), 2);
+ left[21] = vshll_n_s16(vget_low_s16(a[21]), 2);
+ left[22] = vshll_n_s16(vget_low_s16(a[22]), 2);
+ left[23] = vshll_n_s16(vget_low_s16(a[23]), 2);
+ left[24] = vshll_n_s16(vget_low_s16(a[24]), 2);
+ left[25] = vshll_n_s16(vget_low_s16(a[25]), 2);
+ left[26] = vshll_n_s16(vget_low_s16(a[26]), 2);
+ left[27] = vshll_n_s16(vget_low_s16(a[27]), 2);
+ left[28] = vshll_n_s16(vget_low_s16(a[28]), 2);
+ left[29] = vshll_n_s16(vget_low_s16(a[29]), 2);
+ left[30] = vshll_n_s16(vget_low_s16(a[30]), 2);
+ left[31] = vshll_n_s16(vget_low_s16(a[31]), 2);
+
+ right[0] = vshll_n_s16(vget_high_s16(a[0]), 2);
+ right[1] = vshll_n_s16(vget_high_s16(a[1]), 2);
+ right[2] = vshll_n_s16(vget_high_s16(a[2]), 2);
+ right[3] = vshll_n_s16(vget_high_s16(a[3]), 2);
+ right[4] = vshll_n_s16(vget_high_s16(a[4]), 2);
+ right[5] = vshll_n_s16(vget_high_s16(a[5]), 2);
+ right[6] = vshll_n_s16(vget_high_s16(a[6]), 2);
+ right[7] = vshll_n_s16(vget_high_s16(a[7]), 2);
+ right[8] = vshll_n_s16(vget_high_s16(a[8]), 2);
+ right[9] = vshll_n_s16(vget_high_s16(a[9]), 2);
+ right[10] = vshll_n_s16(vget_high_s16(a[10]), 2);
+ right[11] = vshll_n_s16(vget_high_s16(a[11]), 2);
+ right[12] = vshll_n_s16(vget_high_s16(a[12]), 2);
+ right[13] = vshll_n_s16(vget_high_s16(a[13]), 2);
+ right[14] = vshll_n_s16(vget_high_s16(a[14]), 2);
+ right[15] = vshll_n_s16(vget_high_s16(a[15]), 2);
+ right[16] = vshll_n_s16(vget_high_s16(a[16]), 2);
+ right[17] = vshll_n_s16(vget_high_s16(a[17]), 2);
+ right[18] = vshll_n_s16(vget_high_s16(a[18]), 2);
+ right[19] = vshll_n_s16(vget_high_s16(a[19]), 2);
+ right[20] = vshll_n_s16(vget_high_s16(a[20]), 2);
+ right[21] = vshll_n_s16(vget_high_s16(a[21]), 2);
+ right[22] = vshll_n_s16(vget_high_s16(a[22]), 2);
+ right[23] = vshll_n_s16(vget_high_s16(a[23]), 2);
+ right[24] = vshll_n_s16(vget_high_s16(a[24]), 2);
+ right[25] = vshll_n_s16(vget_high_s16(a[25]), 2);
+ right[26] = vshll_n_s16(vget_high_s16(a[26]), 2);
+ right[27] = vshll_n_s16(vget_high_s16(a[27]), 2);
+ right[28] = vshll_n_s16(vget_high_s16(a[28]), 2);
+ right[29] = vshll_n_s16(vget_high_s16(a[29]), 2);
+ right[30] = vshll_n_s16(vget_high_s16(a[30]), 2);
+ right[31] = vshll_n_s16(vget_high_s16(a[31]), 2);
+}
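+ 
+// A minimal scalar sketch of the scaling above (hypothetical helper, unused).
+// vshll_n_s16(x, 2) widens each sample to 32 bits and multiplies it by 4; the
+// low four lanes of each row go to "left" and the high four lanes to "right".
+static INLINE int32_t highbd_scale_input_model(int16_t x) {
+ return (int32_t)x * 4;
+}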
+
+static INLINE void highbd_cross_input(const int32x4_t *a_left /*[32]*/,
+ int32x4_t *a_right /*[32]*/,
+ int32x4_t *b_left /*[32]*/,
+ int32x4_t *b_right /*[32]*/) {
+ // Stage 1. Done as part of the load for the first pass.
+ b_left[0] = vaddq_s32(a_left[0], a_left[31]);
+ b_left[1] = vaddq_s32(a_left[1], a_left[30]);
+ b_left[2] = vaddq_s32(a_left[2], a_left[29]);
+ b_left[3] = vaddq_s32(a_left[3], a_left[28]);
+ b_left[4] = vaddq_s32(a_left[4], a_left[27]);
+ b_left[5] = vaddq_s32(a_left[5], a_left[26]);
+ b_left[6] = vaddq_s32(a_left[6], a_left[25]);
+ b_left[7] = vaddq_s32(a_left[7], a_left[24]);
+ b_left[8] = vaddq_s32(a_left[8], a_left[23]);
+ b_left[9] = vaddq_s32(a_left[9], a_left[22]);
+ b_left[10] = vaddq_s32(a_left[10], a_left[21]);
+ b_left[11] = vaddq_s32(a_left[11], a_left[20]);
+ b_left[12] = vaddq_s32(a_left[12], a_left[19]);
+ b_left[13] = vaddq_s32(a_left[13], a_left[18]);
+ b_left[14] = vaddq_s32(a_left[14], a_left[17]);
+ b_left[15] = vaddq_s32(a_left[15], a_left[16]);
+
+ b_right[0] = vaddq_s32(a_right[0], a_right[31]);
+ b_right[1] = vaddq_s32(a_right[1], a_right[30]);
+ b_right[2] = vaddq_s32(a_right[2], a_right[29]);
+ b_right[3] = vaddq_s32(a_right[3], a_right[28]);
+ b_right[4] = vaddq_s32(a_right[4], a_right[27]);
+ b_right[5] = vaddq_s32(a_right[5], a_right[26]);
+ b_right[6] = vaddq_s32(a_right[6], a_right[25]);
+ b_right[7] = vaddq_s32(a_right[7], a_right[24]);
+ b_right[8] = vaddq_s32(a_right[8], a_right[23]);
+ b_right[9] = vaddq_s32(a_right[9], a_right[22]);
+ b_right[10] = vaddq_s32(a_right[10], a_right[21]);
+ b_right[11] = vaddq_s32(a_right[11], a_right[20]);
+ b_right[12] = vaddq_s32(a_right[12], a_right[19]);
+ b_right[13] = vaddq_s32(a_right[13], a_right[18]);
+ b_right[14] = vaddq_s32(a_right[14], a_right[17]);
+ b_right[15] = vaddq_s32(a_right[15], a_right[16]);
+
+ b_left[16] = vsubq_s32(a_left[15], a_left[16]);
+ b_left[17] = vsubq_s32(a_left[14], a_left[17]);
+ b_left[18] = vsubq_s32(a_left[13], a_left[18]);
+ b_left[19] = vsubq_s32(a_left[12], a_left[19]);
+ b_left[20] = vsubq_s32(a_left[11], a_left[20]);
+ b_left[21] = vsubq_s32(a_left[10], a_left[21]);
+ b_left[22] = vsubq_s32(a_left[9], a_left[22]);
+ b_left[23] = vsubq_s32(a_left[8], a_left[23]);
+ b_left[24] = vsubq_s32(a_left[7], a_left[24]);
+ b_left[25] = vsubq_s32(a_left[6], a_left[25]);
+ b_left[26] = vsubq_s32(a_left[5], a_left[26]);
+ b_left[27] = vsubq_s32(a_left[4], a_left[27]);
+ b_left[28] = vsubq_s32(a_left[3], a_left[28]);
+ b_left[29] = vsubq_s32(a_left[2], a_left[29]);
+ b_left[30] = vsubq_s32(a_left[1], a_left[30]);
+ b_left[31] = vsubq_s32(a_left[0], a_left[31]);
+
+ b_right[16] = vsubq_s32(a_right[15], a_right[16]);
+ b_right[17] = vsubq_s32(a_right[14], a_right[17]);
+ b_right[18] = vsubq_s32(a_right[13], a_right[18]);
+ b_right[19] = vsubq_s32(a_right[12], a_right[19]);
+ b_right[20] = vsubq_s32(a_right[11], a_right[20]);
+ b_right[21] = vsubq_s32(a_right[10], a_right[21]);
+ b_right[22] = vsubq_s32(a_right[9], a_right[22]);
+ b_right[23] = vsubq_s32(a_right[8], a_right[23]);
+ b_right[24] = vsubq_s32(a_right[7], a_right[24]);
+ b_right[25] = vsubq_s32(a_right[6], a_right[25]);
+ b_right[26] = vsubq_s32(a_right[5], a_right[26]);
+ b_right[27] = vsubq_s32(a_right[4], a_right[27]);
+ b_right[28] = vsubq_s32(a_right[3], a_right[28]);
+ b_right[29] = vsubq_s32(a_right[2], a_right[29]);
+ b_right[30] = vsubq_s32(a_right[1], a_right[30]);
+ b_right[31] = vsubq_s32(a_right[0], a_right[31]);
+}
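+ 
+// A minimal scalar sketch of the stage 1 cross above (hypothetical helper,
+// unused), applied independently to the left and right column halves.
+static INLINE void highbd_cross_input_model(const int32_t *a /*[32]*/,
+                                            int32_t *b /*[32]*/) {
+ int i;
+ for (i = 0; i < 16; i++) b[i] = a[i] + a[31 - i];
+ for (i = 16; i < 32; i++) b[i] = a[31 - i] - a[i];
+}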
+
+static INLINE void highbd_partial_add_round_shift(int32x4_t *left /*[32]*/,
+ int32x4_t *right /* [32] */) {
+ // Also compute partial rounding shift:
+ // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+
+ left[0] = add_round_shift_s32(left[0]);
+ left[1] = add_round_shift_s32(left[1]);
+ left[2] = add_round_shift_s32(left[2]);
+ left[3] = add_round_shift_s32(left[3]);
+ left[4] = add_round_shift_s32(left[4]);
+ left[5] = add_round_shift_s32(left[5]);
+ left[6] = add_round_shift_s32(left[6]);
+ left[7] = add_round_shift_s32(left[7]);
+ left[8] = add_round_shift_s32(left[8]);
+ left[9] = add_round_shift_s32(left[9]);
+ left[10] = add_round_shift_s32(left[10]);
+ left[11] = add_round_shift_s32(left[11]);
+ left[12] = add_round_shift_s32(left[12]);
+ left[13] = add_round_shift_s32(left[13]);
+ left[14] = add_round_shift_s32(left[14]);
+ left[15] = add_round_shift_s32(left[15]);
+ left[16] = add_round_shift_s32(left[16]);
+ left[17] = add_round_shift_s32(left[17]);
+ left[18] = add_round_shift_s32(left[18]);
+ left[19] = add_round_shift_s32(left[19]);
+ left[20] = add_round_shift_s32(left[20]);
+ left[21] = add_round_shift_s32(left[21]);
+ left[22] = add_round_shift_s32(left[22]);
+ left[23] = add_round_shift_s32(left[23]);
+ left[24] = add_round_shift_s32(left[24]);
+ left[25] = add_round_shift_s32(left[25]);
+ left[26] = add_round_shift_s32(left[26]);
+ left[27] = add_round_shift_s32(left[27]);
+ left[28] = add_round_shift_s32(left[28]);
+ left[29] = add_round_shift_s32(left[29]);
+ left[30] = add_round_shift_s32(left[30]);
+ left[31] = add_round_shift_s32(left[31]);
+
+ right[0] = add_round_shift_s32(right[0]);
+ right[1] = add_round_shift_s32(right[1]);
+ right[2] = add_round_shift_s32(right[2]);
+ right[3] = add_round_shift_s32(right[3]);
+ right[4] = add_round_shift_s32(right[4]);
+ right[5] = add_round_shift_s32(right[5]);
+ right[6] = add_round_shift_s32(right[6]);
+ right[7] = add_round_shift_s32(right[7]);
+ right[8] = add_round_shift_s32(right[8]);
+ right[9] = add_round_shift_s32(right[9]);
+ right[10] = add_round_shift_s32(right[10]);
+ right[11] = add_round_shift_s32(right[11]);
+ right[12] = add_round_shift_s32(right[12]);
+ right[13] = add_round_shift_s32(right[13]);
+ right[14] = add_round_shift_s32(right[14]);
+ right[15] = add_round_shift_s32(right[15]);
+ right[16] = add_round_shift_s32(right[16]);
+ right[17] = add_round_shift_s32(right[17]);
+ right[18] = add_round_shift_s32(right[18]);
+ right[19] = add_round_shift_s32(right[19]);
+ right[20] = add_round_shift_s32(right[20]);
+ right[21] = add_round_shift_s32(right[21]);
+ right[22] = add_round_shift_s32(right[22]);
+ right[23] = add_round_shift_s32(right[23]);
+ right[24] = add_round_shift_s32(right[24]);
+ right[25] = add_round_shift_s32(right[25]);
+ right[26] = add_round_shift_s32(right[26]);
+ right[27] = add_round_shift_s32(right[27]);
+ right[28] = add_round_shift_s32(right[28]);
+ right[29] = add_round_shift_s32(right[29]);
+ right[30] = add_round_shift_s32(right[30]);
+ right[31] = add_round_shift_s32(right[31]);
+}
+
+static INLINE void highbd_partial_sub_round_shift(int32x4_t *left /*[32]*/,
+ int32x4_t *right /* [32] */) {
+ // Also compute partial rounding shift:
+ // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+
+ left[0] = sub_round_shift_s32(left[0]);
+ left[1] = sub_round_shift_s32(left[1]);
+ left[2] = sub_round_shift_s32(left[2]);
+ left[3] = sub_round_shift_s32(left[3]);
+ left[4] = sub_round_shift_s32(left[4]);
+ left[5] = sub_round_shift_s32(left[5]);
+ left[6] = sub_round_shift_s32(left[6]);
+ left[7] = sub_round_shift_s32(left[7]);
+ left[8] = sub_round_shift_s32(left[8]);
+ left[9] = sub_round_shift_s32(left[9]);
+ left[10] = sub_round_shift_s32(left[10]);
+ left[11] = sub_round_shift_s32(left[11]);
+ left[12] = sub_round_shift_s32(left[12]);
+ left[13] = sub_round_shift_s32(left[13]);
+ left[14] = sub_round_shift_s32(left[14]);
+ left[15] = sub_round_shift_s32(left[15]);
+ left[16] = sub_round_shift_s32(left[16]);
+ left[17] = sub_round_shift_s32(left[17]);
+ left[18] = sub_round_shift_s32(left[18]);
+ left[19] = sub_round_shift_s32(left[19]);
+ left[20] = sub_round_shift_s32(left[20]);
+ left[21] = sub_round_shift_s32(left[21]);
+ left[22] = sub_round_shift_s32(left[22]);
+ left[23] = sub_round_shift_s32(left[23]);
+ left[24] = sub_round_shift_s32(left[24]);
+ left[25] = sub_round_shift_s32(left[25]);
+ left[26] = sub_round_shift_s32(left[26]);
+ left[27] = sub_round_shift_s32(left[27]);
+ left[28] = sub_round_shift_s32(left[28]);
+ left[29] = sub_round_shift_s32(left[29]);
+ left[30] = sub_round_shift_s32(left[30]);
+ left[31] = sub_round_shift_s32(left[31]);
+
+ right[0] = sub_round_shift_s32(right[0]);
+ right[1] = sub_round_shift_s32(right[1]);
+ right[2] = sub_round_shift_s32(right[2]);
+ right[3] = sub_round_shift_s32(right[3]);
+ right[4] = sub_round_shift_s32(right[4]);
+ right[5] = sub_round_shift_s32(right[5]);
+ right[6] = sub_round_shift_s32(right[6]);
+ right[7] = sub_round_shift_s32(right[7]);
+ right[8] = sub_round_shift_s32(right[8]);
+ right[9] = sub_round_shift_s32(right[9]);
+ right[10] = sub_round_shift_s32(right[10]);
+ right[11] = sub_round_shift_s32(right[11]);
+ right[12] = sub_round_shift_s32(right[12]);
+ right[13] = sub_round_shift_s32(right[13]);
+ right[14] = sub_round_shift_s32(right[14]);
+ right[15] = sub_round_shift_s32(right[15]);
+ right[16] = sub_round_shift_s32(right[16]);
+ right[17] = sub_round_shift_s32(right[17]);
+ right[18] = sub_round_shift_s32(right[18]);
+ right[19] = sub_round_shift_s32(right[19]);
+ right[20] = sub_round_shift_s32(right[20]);
+ right[21] = sub_round_shift_s32(right[21]);
+ right[22] = sub_round_shift_s32(right[22]);
+ right[23] = sub_round_shift_s32(right[23]);
+ right[24] = sub_round_shift_s32(right[24]);
+ right[25] = sub_round_shift_s32(right[25]);
+ right[26] = sub_round_shift_s32(right[26]);
+ right[27] = sub_round_shift_s32(right[27]);
+ right[28] = sub_round_shift_s32(right[28]);
+ right[29] = sub_round_shift_s32(right[29]);
+ right[30] = sub_round_shift_s32(right[30]);
+ right[31] = sub_round_shift_s32(right[31]);
+}
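+ 
+// Illustrative scalar models of the roundings applied by the two partial
+// helpers above (hypothetical names, unused). These assume the fdct_neon.h
+// helpers keep their documented behaviour: add_round_shift_s32() adds 1
+// (2 when negative) before shifting, sub_round_shift_s32() adds 2 (1 when
+// negative) before shifting.
+static INLINE int32_t add_round_shift_s32_model(int32_t a) {
+ return (a + 1 + (a < 0)) >> 2;
+}
+static INLINE int32_t sub_round_shift_s32_model(int32_t a) {
+ return (a + 1 + (a > 0)) >> 2;
+}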
+
+static INLINE void highbd_dct8x32_body_first_pass(int32x4_t *left /*32*/,
+ int32x4_t *right /*32*/) {
+ int32x4_t al[32], ar[32];
+ int32x4_t bl[32], br[32];
+
+ // Stage 1: Done as part of the load.
+
+ // Stage 2.
+ // Mini cross: add/sub the first 16 values and butterfly the middle 8 of the
+ // second half.
+ al[0] = vaddq_s32(left[0], left[15]);
+ ar[0] = vaddq_s32(right[0], right[15]);
+ al[1] = vaddq_s32(left[1], left[14]);
+ ar[1] = vaddq_s32(right[1], right[14]);
+ al[2] = vaddq_s32(left[2], left[13]);
+ ar[2] = vaddq_s32(right[2], right[13]);
+ al[3] = vaddq_s32(left[3], left[12]);
+ ar[3] = vaddq_s32(right[3], right[12]);
+ al[4] = vaddq_s32(left[4], left[11]);
+ ar[4] = vaddq_s32(right[4], right[11]);
+ al[5] = vaddq_s32(left[5], left[10]);
+ ar[5] = vaddq_s32(right[5], right[10]);
+ al[6] = vaddq_s32(left[6], left[9]);
+ ar[6] = vaddq_s32(right[6], right[9]);
+ al[7] = vaddq_s32(left[7], left[8]);
+ ar[7] = vaddq_s32(right[7], right[8]);
+
+ al[8] = vsubq_s32(left[7], left[8]);
+ ar[8] = vsubq_s32(right[7], right[8]);
+ al[9] = vsubq_s32(left[6], left[9]);
+ ar[9] = vsubq_s32(right[6], right[9]);
+ al[10] = vsubq_s32(left[5], left[10]);
+ ar[10] = vsubq_s32(right[5], right[10]);
+ al[11] = vsubq_s32(left[4], left[11]);
+ ar[11] = vsubq_s32(right[4], right[11]);
+ al[12] = vsubq_s32(left[3], left[12]);
+ ar[12] = vsubq_s32(right[3], right[12]);
+ al[13] = vsubq_s32(left[2], left[13]);
+ ar[13] = vsubq_s32(right[2], right[13]);
+ al[14] = vsubq_s32(left[1], left[14]);
+ ar[14] = vsubq_s32(right[1], right[14]);
+ al[15] = vsubq_s32(left[0], left[15]);
+ ar[15] = vsubq_s32(right[0], right[15]);
+
+ al[16] = left[16];
+ ar[16] = right[16];
+ al[17] = left[17];
+ ar[17] = right[17];
+ al[18] = left[18];
+ ar[18] = right[18];
+ al[19] = left[19];
+ ar[19] = right[19];
+
+ butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
+ cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
+ butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
+ cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
+ butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
+ cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
+ butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
+ cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
+
+ al[28] = left[28];
+ ar[28] = right[28];
+ al[29] = left[29];
+ ar[29] = right[29];
+ al[30] = left[30];
+ ar[30] = right[30];
+ al[31] = left[31];
+ ar[31] = right[31];
+
+ // Stage 3.
+ bl[0] = vaddq_s32(al[0], al[7]);
+ br[0] = vaddq_s32(ar[0], ar[7]);
+ bl[1] = vaddq_s32(al[1], al[6]);
+ br[1] = vaddq_s32(ar[1], ar[6]);
+ bl[2] = vaddq_s32(al[2], al[5]);
+ br[2] = vaddq_s32(ar[2], ar[5]);
+ bl[3] = vaddq_s32(al[3], al[4]);
+ br[3] = vaddq_s32(ar[3], ar[4]);
+
+ bl[4] = vsubq_s32(al[3], al[4]);
+ br[4] = vsubq_s32(ar[3], ar[4]);
+ bl[5] = vsubq_s32(al[2], al[5]);
+ br[5] = vsubq_s32(ar[2], ar[5]);
+ bl[6] = vsubq_s32(al[1], al[6]);
+ br[6] = vsubq_s32(ar[1], ar[6]);
+ bl[7] = vsubq_s32(al[0], al[7]);
+ br[7] = vsubq_s32(ar[0], ar[7]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+ bl[9] = al[9];
+ br[9] = ar[9];
+
+ butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
+ &bl[13], &br[13], &bl[10], &br[10]);
+ butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
+ &bl[12], &br[12], &bl[11], &br[11]);
+
+ bl[14] = al[14];
+ br[14] = ar[14];
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(left[16], al[23]);
+ br[16] = vaddq_s32(right[16], ar[23]);
+ bl[17] = vaddq_s32(left[17], al[22]);
+ br[17] = vaddq_s32(right[17], ar[22]);
+ bl[18] = vaddq_s32(left[18], al[21]);
+ br[18] = vaddq_s32(right[18], ar[21]);
+ bl[19] = vaddq_s32(left[19], al[20]);
+ br[19] = vaddq_s32(right[19], ar[20]);
+
+ bl[20] = vsubq_s32(left[19], al[20]);
+ br[20] = vsubq_s32(right[19], ar[20]);
+ bl[21] = vsubq_s32(left[18], al[21]);
+ br[21] = vsubq_s32(right[18], ar[21]);
+ bl[22] = vsubq_s32(left[17], al[22]);
+ br[22] = vsubq_s32(right[17], ar[22]);
+ bl[23] = vsubq_s32(left[16], al[23]);
+ br[23] = vsubq_s32(right[16], ar[23]);
+
+ bl[24] = vsubq_s32(left[31], al[24]);
+ br[24] = vsubq_s32(right[31], ar[24]);
+ bl[25] = vsubq_s32(left[30], al[25]);
+ br[25] = vsubq_s32(right[30], ar[25]);
+ bl[26] = vsubq_s32(left[29], al[26]);
+ br[26] = vsubq_s32(right[29], ar[26]);
+ bl[27] = vsubq_s32(left[28], al[27]);
+ br[27] = vsubq_s32(right[28], ar[27]);
+
+ bl[28] = vaddq_s32(left[28], al[27]);
+ br[28] = vaddq_s32(right[28], ar[27]);
+ bl[29] = vaddq_s32(left[29], al[26]);
+ br[29] = vaddq_s32(right[29], ar[26]);
+ bl[30] = vaddq_s32(left[30], al[25]);
+ br[30] = vaddq_s32(right[30], ar[25]);
+ bl[31] = vaddq_s32(left[31], al[24]);
+ br[31] = vaddq_s32(right[31], ar[24]);
+
+ // Stage 4.
+ al[0] = vaddq_s32(bl[0], bl[3]);
+ ar[0] = vaddq_s32(br[0], br[3]);
+ al[1] = vaddq_s32(bl[1], bl[2]);
+ ar[1] = vaddq_s32(br[1], br[2]);
+ al[2] = vsubq_s32(bl[1], bl[2]);
+ ar[2] = vsubq_s32(br[1], br[2]);
+ al[3] = vsubq_s32(bl[0], bl[3]);
+ ar[3] = vsubq_s32(br[0], br[3]);
+
+ al[4] = bl[4];
+ ar[4] = br[4];
+
+ butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
+ &ar[6], &al[5], &ar[5]);
+
+ al[7] = bl[7];
+ ar[7] = br[7];
+
+ al[8] = vaddq_s32(bl[8], bl[11]);
+ ar[8] = vaddq_s32(br[8], br[11]);
+ al[9] = vaddq_s32(bl[9], bl[10]);
+ ar[9] = vaddq_s32(br[9], br[10]);
+ al[10] = vsubq_s32(bl[9], bl[10]);
+ ar[10] = vsubq_s32(br[9], br[10]);
+ al[11] = vsubq_s32(bl[8], bl[11]);
+ ar[11] = vsubq_s32(br[8], br[11]);
+ al[12] = vsubq_s32(bl[15], bl[12]);
+ ar[12] = vsubq_s32(br[15], br[12]);
+ al[13] = vsubq_s32(bl[14], bl[13]);
+ ar[13] = vsubq_s32(br[14], br[13]);
+ al[14] = vaddq_s32(bl[14], bl[13]);
+ ar[14] = vaddq_s32(br[14], br[13]);
+ al[15] = vaddq_s32(bl[15], bl[12]);
+ ar[15] = vaddq_s32(br[15], br[12]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[17] = bl[17];
+ ar[17] = br[17];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64,
+ cospi_24_64, &al[29], &ar[29], &al[18],
+ &ar[18]);
+ butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64,
+ cospi_24_64, &al[28], &ar[28], &al[19],
+ &ar[19]);
+ butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20],
+ cospi_24_64, -cospi_8_64, &al[27], &ar[27],
+ &al[20], &ar[20]);
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+ cospi_24_64, -cospi_8_64, &al[26], &ar[26],
+ &al[21], &ar[21]);
+
+ al[22] = bl[22];
+ ar[22] = br[22];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[25] = bl[25];
+ ar[25] = br[25];
+
+ al[30] = bl[30];
+ ar[30] = br[30];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ // Stage 5.
+ butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
+ &br[0], &bl[1], &br[1]);
+ butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64,
+ cospi_24_64, &bl[2], &br[2], &bl[3],
+ &br[3]);
+
+ bl[4] = vaddq_s32(al[4], al[5]);
+ br[4] = vaddq_s32(ar[4], ar[5]);
+ bl[5] = vsubq_s32(al[4], al[5]);
+ br[5] = vsubq_s32(ar[4], ar[5]);
+ bl[6] = vsubq_s32(al[7], al[6]);
+ br[6] = vsubq_s32(ar[7], ar[6]);
+ bl[7] = vaddq_s32(al[7], al[6]);
+ br[7] = vaddq_s32(ar[7], ar[6]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+
+ butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64,
+ cospi_24_64, &bl[14], &br[14], &bl[9],
+ &br[9]);
+ butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+ cospi_24_64, -cospi_8_64, &bl[13], &br[13],
+ &bl[10], &br[10]);
+
+ bl[11] = al[11];
+ br[11] = ar[11];
+ bl[12] = al[12];
+ br[12] = ar[12];
+
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(al[19], al[16]);
+ br[16] = vaddq_s32(ar[19], ar[16]);
+ bl[17] = vaddq_s32(al[18], al[17]);
+ br[17] = vaddq_s32(ar[18], ar[17]);
+ bl[18] = vsubq_s32(al[17], al[18]);
+ br[18] = vsubq_s32(ar[17], ar[18]);
+ bl[19] = vsubq_s32(al[16], al[19]);
+ br[19] = vsubq_s32(ar[16], ar[19]);
+ bl[20] = vsubq_s32(al[23], al[20]);
+ br[20] = vsubq_s32(ar[23], ar[20]);
+ bl[21] = vsubq_s32(al[22], al[21]);
+ br[21] = vsubq_s32(ar[22], ar[21]);
+ bl[22] = vaddq_s32(al[21], al[22]);
+ br[22] = vaddq_s32(ar[21], ar[22]);
+ bl[23] = vaddq_s32(al[20], al[23]);
+ br[23] = vaddq_s32(ar[20], ar[23]);
+ bl[24] = vaddq_s32(al[27], al[24]);
+ br[24] = vaddq_s32(ar[27], ar[24]);
+ bl[25] = vaddq_s32(al[26], al[25]);
+ br[25] = vaddq_s32(ar[26], ar[25]);
+ bl[26] = vsubq_s32(al[25], al[26]);
+ br[26] = vsubq_s32(ar[25], ar[26]);
+ bl[27] = vsubq_s32(al[24], al[27]);
+ br[27] = vsubq_s32(ar[24], ar[27]);
+ bl[28] = vsubq_s32(al[31], al[28]);
+ br[28] = vsubq_s32(ar[31], ar[28]);
+ bl[29] = vsubq_s32(al[30], al[29]);
+ br[29] = vsubq_s32(ar[30], ar[29]);
+ bl[30] = vaddq_s32(al[29], al[30]);
+ br[30] = vaddq_s32(ar[29], ar[30]);
+ bl[31] = vaddq_s32(al[28], al[31]);
+ br[31] = vaddq_s32(ar[28], ar[31]);
+
+ // Stage 6.
+ al[0] = bl[0];
+ ar[0] = br[0];
+ al[1] = bl[1];
+ ar[1] = br[1];
+ al[2] = bl[2];
+ ar[2] = br[2];
+ al[3] = bl[3];
+ ar[3] = br[3];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64,
+ cospi_28_64, &al[4], &ar[4], &al[7],
+ &ar[7]);
+ butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64,
+ cospi_12_64, &al[5], &ar[5], &al[6],
+ &ar[6]);
+
+ al[8] = vaddq_s32(bl[8], bl[9]);
+ ar[8] = vaddq_s32(br[8], br[9]);
+ al[9] = vsubq_s32(bl[8], bl[9]);
+ ar[9] = vsubq_s32(br[8], br[9]);
+ al[10] = vsubq_s32(bl[11], bl[10]);
+ ar[10] = vsubq_s32(br[11], br[10]);
+ al[11] = vaddq_s32(bl[11], bl[10]);
+ ar[11] = vaddq_s32(br[11], br[10]);
+ al[12] = vaddq_s32(bl[12], bl[13]);
+ ar[12] = vaddq_s32(br[12], br[13]);
+ al[13] = vsubq_s32(bl[12], bl[13]);
+ ar[13] = vsubq_s32(br[12], br[13]);
+ al[14] = vsubq_s32(bl[15], bl[14]);
+ ar[14] = vsubq_s32(br[15], br[14]);
+ al[15] = vaddq_s32(bl[15], bl[14]);
+ ar[15] = vaddq_s32(br[15], br[14]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[19] = bl[19];
+ ar[19] = br[19];
+ al[20] = bl[20];
+ ar[20] = br[20];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[27] = bl[27];
+ ar[27] = br[27];
+ al[28] = bl[28];
+ ar[28] = br[28];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64,
+ cospi_28_64, &al[30], &ar[30], &al[17],
+ &ar[17]);
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18],
+ cospi_28_64, -cospi_4_64, &al[29], &ar[29],
+ &al[18], &ar[18]);
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+ cospi_20_64, cospi_12_64, &al[26], &ar[26],
+ &al[21], &ar[21]);
+ butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+ cospi_12_64, -cospi_20_64, &al[25],
+ &ar[25], &al[22], &ar[22]);
+
+ // Stage 7.
+ bl[0] = al[0];
+ br[0] = ar[0];
+ bl[1] = al[1];
+ br[1] = ar[1];
+ bl[2] = al[2];
+ br[2] = ar[2];
+ bl[3] = al[3];
+ br[3] = ar[3];
+ bl[4] = al[4];
+ br[4] = ar[4];
+ bl[5] = al[5];
+ br[5] = ar[5];
+ bl[6] = al[6];
+ br[6] = ar[6];
+ bl[7] = al[7];
+ br[7] = ar[7];
+
+ butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64,
+ cospi_30_64, &bl[8], &br[8], &bl[15],
+ &br[15]);
+ butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64,
+ cospi_14_64, &bl[9], &br[9], &bl[14],
+ &br[14]);
+ butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+ cospi_10_64, cospi_22_64, &bl[10], &br[10],
+ &bl[13], &br[13]);
+ butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11],
+ cospi_26_64, cospi_6_64, &bl[11], &br[11],
+ &bl[12], &br[12]);
+
+ bl[16] = vaddq_s32(al[16], al[17]);
+ br[16] = vaddq_s32(ar[16], ar[17]);
+ bl[17] = vsubq_s32(al[16], al[17]);
+ br[17] = vsubq_s32(ar[16], ar[17]);
+ bl[18] = vsubq_s32(al[19], al[18]);
+ br[18] = vsubq_s32(ar[19], ar[18]);
+ bl[19] = vaddq_s32(al[19], al[18]);
+ br[19] = vaddq_s32(ar[19], ar[18]);
+ bl[20] = vaddq_s32(al[20], al[21]);
+ br[20] = vaddq_s32(ar[20], ar[21]);
+ bl[21] = vsubq_s32(al[20], al[21]);
+ br[21] = vsubq_s32(ar[20], ar[21]);
+ bl[22] = vsubq_s32(al[23], al[22]);
+ br[22] = vsubq_s32(ar[23], ar[22]);
+ bl[23] = vaddq_s32(al[23], al[22]);
+ br[23] = vaddq_s32(ar[23], ar[22]);
+ bl[24] = vaddq_s32(al[24], al[25]);
+ br[24] = vaddq_s32(ar[24], ar[25]);
+ bl[25] = vsubq_s32(al[24], al[25]);
+ br[25] = vsubq_s32(ar[24], ar[25]);
+ bl[26] = vsubq_s32(al[27], al[26]);
+ br[26] = vsubq_s32(ar[27], ar[26]);
+ bl[27] = vaddq_s32(al[27], al[26]);
+ br[27] = vaddq_s32(ar[27], ar[26]);
+ bl[28] = vaddq_s32(al[28], al[29]);
+ br[28] = vaddq_s32(ar[28], ar[29]);
+ bl[29] = vsubq_s32(al[28], al[29]);
+ br[29] = vsubq_s32(ar[28], ar[29]);
+ bl[30] = vsubq_s32(al[31], al[30]);
+ br[30] = vsubq_s32(ar[31], ar[30]);
+ bl[31] = vaddq_s32(al[31], al[30]);
+ br[31] = vaddq_s32(ar[31], ar[30]);
+
+ // Final stage.
+
+ left[0] = bl[0];
+ right[0] = br[0];
+ left[16] = bl[1];
+ right[16] = br[1];
+ left[8] = bl[2];
+ right[8] = br[2];
+ left[24] = bl[3];
+ right[24] = br[3];
+ left[4] = bl[4];
+ right[4] = br[4];
+ left[20] = bl[5];
+ right[20] = br[5];
+ left[12] = bl[6];
+ right[12] = br[6];
+ left[28] = bl[7];
+ right[28] = br[7];
+ left[2] = bl[8];
+ right[2] = br[8];
+ left[18] = bl[9];
+ right[18] = br[9];
+ left[10] = bl[10];
+ right[10] = br[10];
+ left[26] = bl[11];
+ right[26] = br[11];
+ left[6] = bl[12];
+ right[6] = br[12];
+ left[22] = bl[13];
+ right[22] = br[13];
+ left[14] = bl[14];
+ right[14] = br[14];
+ left[30] = bl[15];
+ right[30] = br[15];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64,
+ cospi_31_64, &al[1], &ar[1], &al[31],
+ &ar[31]);
+ left[1] = al[1];
+ right[1] = ar[1];
+ left[31] = al[31];
+ right[31] = ar[31];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17],
+ cospi_17_64, cospi_15_64, &al[17], &ar[17],
+ &al[15], &ar[15]);
+ left[17] = al[17];
+ right[17] = ar[17];
+ left[15] = al[15];
+ right[15] = ar[15];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64,
+ cospi_23_64, &al[9], &ar[9], &al[23],
+ &ar[23]);
+ left[9] = al[9];
+ right[9] = ar[9];
+ left[23] = al[23];
+ right[23] = ar[23];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19],
+ cospi_25_64, cospi_7_64, &al[25], &ar[25],
+ &al[7], &ar[7]);
+ left[25] = al[25];
+ right[25] = ar[25];
+ left[7] = al[7];
+ right[7] = ar[7];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64,
+ cospi_27_64, &al[5], &ar[5], &al[27],
+ &ar[27]);
+ left[5] = al[5];
+ right[5] = ar[5];
+ left[27] = al[27];
+ right[27] = ar[27];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+ cospi_21_64, cospi_11_64, &al[21], &ar[21],
+ &al[11], &ar[11]);
+ left[21] = al[21];
+ right[21] = ar[21];
+ left[11] = al[11];
+ right[11] = ar[11];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+ cospi_13_64, cospi_19_64, &al[13], &ar[13],
+ &al[19], &ar[19]);
+ left[13] = al[13];
+ right[13] = ar[13];
+ left[19] = al[19];
+ right[19] = ar[19];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23],
+ cospi_29_64, cospi_3_64, &al[29], &ar[29],
+ &al[3], &ar[3]);
+ left[29] = al[29];
+ right[29] = ar[29];
+ left[3] = al[3];
+ right[3] = ar[3];
+}
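+ 
+// A scalar sketch of the single-coefficient butterfly used with cospi_16_64
+// above (hypothetical helper, unused). This is the scalar reference
+// computation that butterfly_one_coeff_s32_fast() is assumed to approximate;
+// the NEON "fast" variant may differ in the low-order rounding bits.
+static INLINE void butterfly_one_coeff_model(int32_t a, int32_t b, int32_t c,
+                                             int32_t *add, int32_t *sub) {
+ const int64_t sum = ((int64_t)a + b) * c;
+ const int64_t diff = ((int64_t)a - b) * c;
+ *add = (int32_t)((sum + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
+ *sub = (int32_t)((diff + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
+}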
+
+static INLINE void highbd_dct8x32_body_second_pass(int32x4_t *left /*32*/,
+ int32x4_t *right /*32*/) {
+ int32x4_t al[32], ar[32];
+ int32x4_t bl[32], br[32];
+
+ // Stage 1: Done as part of the load.
+
+ // Stage 2.
+ // Mini cross: add/sub the first 16 values and butterfly the middle 8 of the
+ // second half.
+ al[0] = vaddq_s32(left[0], left[15]);
+ ar[0] = vaddq_s32(right[0], right[15]);
+ al[1] = vaddq_s32(left[1], left[14]);
+ ar[1] = vaddq_s32(right[1], right[14]);
+ al[2] = vaddq_s32(left[2], left[13]);
+ ar[2] = vaddq_s32(right[2], right[13]);
+ al[3] = vaddq_s32(left[3], left[12]);
+ ar[3] = vaddq_s32(right[3], right[12]);
+ al[4] = vaddq_s32(left[4], left[11]);
+ ar[4] = vaddq_s32(right[4], right[11]);
+ al[5] = vaddq_s32(left[5], left[10]);
+ ar[5] = vaddq_s32(right[5], right[10]);
+ al[6] = vaddq_s32(left[6], left[9]);
+ ar[6] = vaddq_s32(right[6], right[9]);
+ al[7] = vaddq_s32(left[7], left[8]);
+ ar[7] = vaddq_s32(right[7], right[8]);
+
+ al[8] = vsubq_s32(left[7], left[8]);
+ ar[8] = vsubq_s32(right[7], right[8]);
+ al[9] = vsubq_s32(left[6], left[9]);
+ ar[9] = vsubq_s32(right[6], right[9]);
+ al[10] = vsubq_s32(left[5], left[10]);
+ ar[10] = vsubq_s32(right[5], right[10]);
+ al[11] = vsubq_s32(left[4], left[11]);
+ ar[11] = vsubq_s32(right[4], right[11]);
+ al[12] = vsubq_s32(left[3], left[12]);
+ ar[12] = vsubq_s32(right[3], right[12]);
+ al[13] = vsubq_s32(left[2], left[13]);
+ ar[13] = vsubq_s32(right[2], right[13]);
+ al[14] = vsubq_s32(left[1], left[14]);
+ ar[14] = vsubq_s32(right[1], right[14]);
+ al[15] = vsubq_s32(left[0], left[15]);
+ ar[15] = vsubq_s32(right[0], right[15]);
+
+ al[16] = left[16];
+ ar[16] = right[16];
+ al[17] = left[17];
+ ar[17] = right[17];
+ al[18] = left[18];
+ ar[18] = right[18];
+ al[19] = left[19];
+ ar[19] = right[19];
+
+ butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
+ cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
+ butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
+ cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
+ butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
+ cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
+ butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
+ cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
+
+ al[28] = left[28];
+ ar[28] = right[28];
+ al[29] = left[29];
+ ar[29] = right[29];
+ al[30] = left[30];
+ ar[30] = right[30];
+ al[31] = left[31];
+ ar[31] = right[31];
+
+ // Stage 3.
+ bl[0] = vaddq_s32(al[0], al[7]);
+ br[0] = vaddq_s32(ar[0], ar[7]);
+ bl[1] = vaddq_s32(al[1], al[6]);
+ br[1] = vaddq_s32(ar[1], ar[6]);
+ bl[2] = vaddq_s32(al[2], al[5]);
+ br[2] = vaddq_s32(ar[2], ar[5]);
+ bl[3] = vaddq_s32(al[3], al[4]);
+ br[3] = vaddq_s32(ar[3], ar[4]);
+
+ bl[4] = vsubq_s32(al[3], al[4]);
+ br[4] = vsubq_s32(ar[3], ar[4]);
+ bl[5] = vsubq_s32(al[2], al[5]);
+ br[5] = vsubq_s32(ar[2], ar[5]);
+ bl[6] = vsubq_s32(al[1], al[6]);
+ br[6] = vsubq_s32(ar[1], ar[6]);
+ bl[7] = vsubq_s32(al[0], al[7]);
+ br[7] = vsubq_s32(ar[0], ar[7]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+ bl[9] = al[9];
+ br[9] = ar[9];
+
+ butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
+ &bl[13], &br[13], &bl[10], &br[10]);
+ butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
+ &bl[12], &br[12], &bl[11], &br[11]);
+
+ bl[14] = al[14];
+ br[14] = ar[14];
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(left[16], al[23]);
+ br[16] = vaddq_s32(right[16], ar[23]);
+ bl[17] = vaddq_s32(left[17], al[22]);
+ br[17] = vaddq_s32(right[17], ar[22]);
+ bl[18] = vaddq_s32(left[18], al[21]);
+ br[18] = vaddq_s32(right[18], ar[21]);
+ bl[19] = vaddq_s32(left[19], al[20]);
+ br[19] = vaddq_s32(right[19], ar[20]);
+
+ bl[20] = vsubq_s32(left[19], al[20]);
+ br[20] = vsubq_s32(right[19], ar[20]);
+ bl[21] = vsubq_s32(left[18], al[21]);
+ br[21] = vsubq_s32(right[18], ar[21]);
+ bl[22] = vsubq_s32(left[17], al[22]);
+ br[22] = vsubq_s32(right[17], ar[22]);
+ bl[23] = vsubq_s32(left[16], al[23]);
+ br[23] = vsubq_s32(right[16], ar[23]);
+
+ bl[24] = vsubq_s32(left[31], al[24]);
+ br[24] = vsubq_s32(right[31], ar[24]);
+ bl[25] = vsubq_s32(left[30], al[25]);
+ br[25] = vsubq_s32(right[30], ar[25]);
+ bl[26] = vsubq_s32(left[29], al[26]);
+ br[26] = vsubq_s32(right[29], ar[26]);
+ bl[27] = vsubq_s32(left[28], al[27]);
+ br[27] = vsubq_s32(right[28], ar[27]);
+
+ bl[28] = vaddq_s32(left[28], al[27]);
+ br[28] = vaddq_s32(right[28], ar[27]);
+ bl[29] = vaddq_s32(left[29], al[26]);
+ br[29] = vaddq_s32(right[29], ar[26]);
+ bl[30] = vaddq_s32(left[30], al[25]);
+ br[30] = vaddq_s32(right[30], ar[25]);
+ bl[31] = vaddq_s32(left[31], al[24]);
+ br[31] = vaddq_s32(right[31], ar[24]);
+
+ // Stage 4.
+ al[0] = vaddq_s32(bl[0], bl[3]);
+ ar[0] = vaddq_s32(br[0], br[3]);
+ al[1] = vaddq_s32(bl[1], bl[2]);
+ ar[1] = vaddq_s32(br[1], br[2]);
+ al[2] = vsubq_s32(bl[1], bl[2]);
+ ar[2] = vsubq_s32(br[1], br[2]);
+ al[3] = vsubq_s32(bl[0], bl[3]);
+ ar[3] = vsubq_s32(br[0], br[3]);
+
+ al[4] = bl[4];
+ ar[4] = br[4];
+
+ butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
+ &ar[6], &al[5], &ar[5]);
+
+ al[7] = bl[7];
+ ar[7] = br[7];
+
+ al[8] = vaddq_s32(bl[8], bl[11]);
+ ar[8] = vaddq_s32(br[8], br[11]);
+ al[9] = vaddq_s32(bl[9], bl[10]);
+ ar[9] = vaddq_s32(br[9], br[10]);
+ al[10] = vsubq_s32(bl[9], bl[10]);
+ ar[10] = vsubq_s32(br[9], br[10]);
+ al[11] = vsubq_s32(bl[8], bl[11]);
+ ar[11] = vsubq_s32(br[8], br[11]);
+ al[12] = vsubq_s32(bl[15], bl[12]);
+ ar[12] = vsubq_s32(br[15], br[12]);
+ al[13] = vsubq_s32(bl[14], bl[13]);
+ ar[13] = vsubq_s32(br[14], br[13]);
+ al[14] = vaddq_s32(bl[14], bl[13]);
+ ar[14] = vaddq_s32(br[14], br[13]);
+ al[15] = vaddq_s32(bl[15], bl[12]);
+ ar[15] = vaddq_s32(br[15], br[12]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[17] = bl[17];
+ ar[17] = br[17];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64,
+ cospi_24_64, &al[29], &ar[29], &al[18],
+ &ar[18]);
+ butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64,
+ cospi_24_64, &al[28], &ar[28], &al[19],
+ &ar[19]);
+ butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20],
+ cospi_24_64, -cospi_8_64, &al[27], &ar[27],
+ &al[20], &ar[20]);
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+ cospi_24_64, -cospi_8_64, &al[26], &ar[26],
+ &al[21], &ar[21]);
+
+ al[22] = bl[22];
+ ar[22] = br[22];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[25] = bl[25];
+ ar[25] = br[25];
+
+ al[30] = bl[30];
+ ar[30] = br[30];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ // Stage 5.
+ butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
+ &br[0], &bl[1], &br[1]);
+ butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64,
+ cospi_24_64, &bl[2], &br[2], &bl[3],
+ &br[3]);
+
+ bl[4] = vaddq_s32(al[4], al[5]);
+ br[4] = vaddq_s32(ar[4], ar[5]);
+ bl[5] = vsubq_s32(al[4], al[5]);
+ br[5] = vsubq_s32(ar[4], ar[5]);
+ bl[6] = vsubq_s32(al[7], al[6]);
+ br[6] = vsubq_s32(ar[7], ar[6]);
+ bl[7] = vaddq_s32(al[7], al[6]);
+ br[7] = vaddq_s32(ar[7], ar[6]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+
+ butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64,
+ cospi_24_64, &bl[14], &br[14], &bl[9],
+ &br[9]);
+ butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+ cospi_24_64, -cospi_8_64, &bl[13], &br[13],
+ &bl[10], &br[10]);
+
+ bl[11] = al[11];
+ br[11] = ar[11];
+ bl[12] = al[12];
+ br[12] = ar[12];
+
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(al[19], al[16]);
+ br[16] = vaddq_s32(ar[19], ar[16]);
+ bl[17] = vaddq_s32(al[18], al[17]);
+ br[17] = vaddq_s32(ar[18], ar[17]);
+ bl[18] = vsubq_s32(al[17], al[18]);
+ br[18] = vsubq_s32(ar[17], ar[18]);
+ bl[19] = vsubq_s32(al[16], al[19]);
+ br[19] = vsubq_s32(ar[16], ar[19]);
+ bl[20] = vsubq_s32(al[23], al[20]);
+ br[20] = vsubq_s32(ar[23], ar[20]);
+ bl[21] = vsubq_s32(al[22], al[21]);
+ br[21] = vsubq_s32(ar[22], ar[21]);
+ bl[22] = vaddq_s32(al[21], al[22]);
+ br[22] = vaddq_s32(ar[21], ar[22]);
+ bl[23] = vaddq_s32(al[20], al[23]);
+ br[23] = vaddq_s32(ar[20], ar[23]);
+ bl[24] = vaddq_s32(al[27], al[24]);
+ br[24] = vaddq_s32(ar[27], ar[24]);
+ bl[25] = vaddq_s32(al[26], al[25]);
+ br[25] = vaddq_s32(ar[26], ar[25]);
+ bl[26] = vsubq_s32(al[25], al[26]);
+ br[26] = vsubq_s32(ar[25], ar[26]);
+ bl[27] = vsubq_s32(al[24], al[27]);
+ br[27] = vsubq_s32(ar[24], ar[27]);
+ bl[28] = vsubq_s32(al[31], al[28]);
+ br[28] = vsubq_s32(ar[31], ar[28]);
+ bl[29] = vsubq_s32(al[30], al[29]);
+ br[29] = vsubq_s32(ar[30], ar[29]);
+ bl[30] = vaddq_s32(al[29], al[30]);
+ br[30] = vaddq_s32(ar[29], ar[30]);
+ bl[31] = vaddq_s32(al[28], al[31]);
+ br[31] = vaddq_s32(ar[28], ar[31]);
+
+ // Stage 6.
+ al[0] = bl[0];
+ ar[0] = br[0];
+ al[1] = bl[1];
+ ar[1] = br[1];
+ al[2] = bl[2];
+ ar[2] = br[2];
+ al[3] = bl[3];
+ ar[3] = br[3];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64,
+ cospi_28_64, &al[4], &ar[4], &al[7],
+ &ar[7]);
+ butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64,
+ cospi_12_64, &al[5], &ar[5], &al[6],
+ &ar[6]);
+
+ al[8] = vaddq_s32(bl[8], bl[9]);
+ ar[8] = vaddq_s32(br[8], br[9]);
+ al[9] = vsubq_s32(bl[8], bl[9]);
+ ar[9] = vsubq_s32(br[8], br[9]);
+ al[10] = vsubq_s32(bl[11], bl[10]);
+ ar[10] = vsubq_s32(br[11], br[10]);
+ al[11] = vaddq_s32(bl[11], bl[10]);
+ ar[11] = vaddq_s32(br[11], br[10]);
+ al[12] = vaddq_s32(bl[12], bl[13]);
+ ar[12] = vaddq_s32(br[12], br[13]);
+ al[13] = vsubq_s32(bl[12], bl[13]);
+ ar[13] = vsubq_s32(br[12], br[13]);
+ al[14] = vsubq_s32(bl[15], bl[14]);
+ ar[14] = vsubq_s32(br[15], br[14]);
+ al[15] = vaddq_s32(bl[15], bl[14]);
+ ar[15] = vaddq_s32(br[15], br[14]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[19] = bl[19];
+ ar[19] = br[19];
+ al[20] = bl[20];
+ ar[20] = br[20];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[27] = bl[27];
+ ar[27] = br[27];
+ al[28] = bl[28];
+ ar[28] = br[28];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64,
+ cospi_28_64, &al[30], &ar[30], &al[17],
+ &ar[17]);
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18],
+ cospi_28_64, -cospi_4_64, &al[29], &ar[29],
+ &al[18], &ar[18]);
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+ cospi_20_64, cospi_12_64, &al[26], &ar[26],
+ &al[21], &ar[21]);
+ butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+ cospi_12_64, -cospi_20_64, &al[25],
+ &ar[25], &al[22], &ar[22]);
+
+ // Stage 7.
+ bl[0] = al[0];
+ br[0] = ar[0];
+ bl[1] = al[1];
+ br[1] = ar[1];
+ bl[2] = al[2];
+ br[2] = ar[2];
+ bl[3] = al[3];
+ br[3] = ar[3];
+ bl[4] = al[4];
+ br[4] = ar[4];
+ bl[5] = al[5];
+ br[5] = ar[5];
+ bl[6] = al[6];
+ br[6] = ar[6];
+ bl[7] = al[7];
+ br[7] = ar[7];
+
+ butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64,
+ cospi_30_64, &bl[8], &br[8], &bl[15],
+ &br[15]);
+ butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64,
+ cospi_14_64, &bl[9], &br[9], &bl[14],
+ &br[14]);
+ butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+ cospi_10_64, cospi_22_64, &bl[10], &br[10],
+ &bl[13], &br[13]);
+ butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11],
+ cospi_26_64, cospi_6_64, &bl[11], &br[11],
+ &bl[12], &br[12]);
+
+ bl[16] = vaddq_s32(al[16], al[17]);
+ br[16] = vaddq_s32(ar[16], ar[17]);
+ bl[17] = vsubq_s32(al[16], al[17]);
+ br[17] = vsubq_s32(ar[16], ar[17]);
+ bl[18] = vsubq_s32(al[19], al[18]);
+ br[18] = vsubq_s32(ar[19], ar[18]);
+ bl[19] = vaddq_s32(al[19], al[18]);
+ br[19] = vaddq_s32(ar[19], ar[18]);
+ bl[20] = vaddq_s32(al[20], al[21]);
+ br[20] = vaddq_s32(ar[20], ar[21]);
+ bl[21] = vsubq_s32(al[20], al[21]);
+ br[21] = vsubq_s32(ar[20], ar[21]);
+ bl[22] = vsubq_s32(al[23], al[22]);
+ br[22] = vsubq_s32(ar[23], ar[22]);
+ bl[23] = vaddq_s32(al[23], al[22]);
+ br[23] = vaddq_s32(ar[23], ar[22]);
+ bl[24] = vaddq_s32(al[24], al[25]);
+ br[24] = vaddq_s32(ar[24], ar[25]);
+ bl[25] = vsubq_s32(al[24], al[25]);
+ br[25] = vsubq_s32(ar[24], ar[25]);
+ bl[26] = vsubq_s32(al[27], al[26]);
+ br[26] = vsubq_s32(ar[27], ar[26]);
+ bl[27] = vaddq_s32(al[27], al[26]);
+ br[27] = vaddq_s32(ar[27], ar[26]);
+ bl[28] = vaddq_s32(al[28], al[29]);
+ br[28] = vaddq_s32(ar[28], ar[29]);
+ bl[29] = vsubq_s32(al[28], al[29]);
+ br[29] = vsubq_s32(ar[28], ar[29]);
+ bl[30] = vsubq_s32(al[31], al[30]);
+ br[30] = vsubq_s32(ar[31], ar[30]);
+ bl[31] = vaddq_s32(al[31], al[30]);
+ br[31] = vaddq_s32(ar[31], ar[30]);
+
+ // Final stage.
+
+ left[0] = bl[0];
+ right[0] = br[0];
+ left[16] = bl[1];
+ right[16] = br[1];
+ left[8] = bl[2];
+ right[8] = br[2];
+ left[24] = bl[3];
+ right[24] = br[3];
+ left[4] = bl[4];
+ right[4] = br[4];
+ left[20] = bl[5];
+ right[20] = br[5];
+ left[12] = bl[6];
+ right[12] = br[6];
+ left[28] = bl[7];
+ right[28] = br[7];
+ left[2] = bl[8];
+ right[2] = br[8];
+ left[18] = bl[9];
+ right[18] = br[9];
+ left[10] = bl[10];
+ right[10] = br[10];
+ left[26] = bl[11];
+ right[26] = br[11];
+ left[6] = bl[12];
+ right[6] = br[12];
+ left[22] = bl[13];
+ right[22] = br[13];
+ left[14] = bl[14];
+ right[14] = br[14];
+ left[30] = bl[15];
+ right[30] = br[15];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64,
+ cospi_31_64, &al[1], &ar[1], &al[31],
+ &ar[31]);
+ left[1] = al[1];
+ right[1] = ar[1];
+ left[31] = al[31];
+ right[31] = ar[31];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17],
+ cospi_17_64, cospi_15_64, &al[17], &ar[17],
+ &al[15], &ar[15]);
+ left[17] = al[17];
+ right[17] = ar[17];
+ left[15] = al[15];
+ right[15] = ar[15];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64,
+ cospi_23_64, &al[9], &ar[9], &al[23],
+ &ar[23]);
+ left[9] = al[9];
+ right[9] = ar[9];
+ left[23] = al[23];
+ right[23] = ar[23];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19],
+ cospi_25_64, cospi_7_64, &al[25], &ar[25],
+ &al[7], &ar[7]);
+ left[25] = al[25];
+ right[25] = ar[25];
+ left[7] = al[7];
+ right[7] = ar[7];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64,
+ cospi_27_64, &al[5], &ar[5], &al[27],
+ &ar[27]);
+ left[5] = al[5];
+ right[5] = ar[5];
+ left[27] = al[27];
+ right[27] = ar[27];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+ cospi_21_64, cospi_11_64, &al[21], &ar[21],
+ &al[11], &ar[11]);
+ left[21] = al[21];
+ right[21] = ar[21];
+ left[11] = al[11];
+ right[11] = ar[11];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+ cospi_13_64, cospi_19_64, &al[13], &ar[13],
+ &al[19], &ar[19]);
+ left[13] = al[13];
+ right[13] = ar[13];
+ left[19] = al[19];
+ right[19] = ar[19];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23],
+ cospi_29_64, cospi_3_64, &al[29], &ar[29],
+ &al[3], &ar[3]);
+ left[29] = al[29];
+ right[29] = ar[29];
+ left[3] = al[3];
+ right[3] = ar[3];
+}
+
+static INLINE void highbd_dct8x32_body_second_pass_rd(int32x4_t *left /*32*/,
+ int32x4_t *right /*32*/) {
+ int32x4_t al[32], ar[32];
+ int32x4_t bl[32], br[32];
+
+ // Stage 1: Done as part of the load.
+
+ // Stage 2.
+ // For the "rd" version, all the values are rounded down after stage 2 to keep
+ // the values in 16 bits.
+ al[0] = add_round_shift_s32(vaddq_s32(left[0], left[15]));
+ ar[0] = add_round_shift_s32(vaddq_s32(right[0], right[15]));
+ al[1] = add_round_shift_s32(vaddq_s32(left[1], left[14]));
+ ar[1] = add_round_shift_s32(vaddq_s32(right[1], right[14]));
+ al[2] = add_round_shift_s32(vaddq_s32(left[2], left[13]));
+ ar[2] = add_round_shift_s32(vaddq_s32(right[2], right[13]));
+ al[3] = add_round_shift_s32(vaddq_s32(left[3], left[12]));
+ ar[3] = add_round_shift_s32(vaddq_s32(right[3], right[12]));
+ al[4] = add_round_shift_s32(vaddq_s32(left[4], left[11]));
+ ar[4] = add_round_shift_s32(vaddq_s32(right[4], right[11]));
+ al[5] = add_round_shift_s32(vaddq_s32(left[5], left[10]));
+ ar[5] = add_round_shift_s32(vaddq_s32(right[5], right[10]));
+ al[6] = add_round_shift_s32(vaddq_s32(left[6], left[9]));
+ ar[6] = add_round_shift_s32(vaddq_s32(right[6], right[9]));
+ al[7] = add_round_shift_s32(vaddq_s32(left[7], left[8]));
+ ar[7] = add_round_shift_s32(vaddq_s32(right[7], right[8]));
+
+ al[8] = add_round_shift_s32(vsubq_s32(left[7], left[8]));
+ ar[8] = add_round_shift_s32(vsubq_s32(right[7], right[8]));
+ al[9] = add_round_shift_s32(vsubq_s32(left[6], left[9]));
+ ar[9] = add_round_shift_s32(vsubq_s32(right[6], right[9]));
+ al[10] = add_round_shift_s32(vsubq_s32(left[5], left[10]));
+ ar[10] = add_round_shift_s32(vsubq_s32(right[5], right[10]));
+ al[11] = add_round_shift_s32(vsubq_s32(left[4], left[11]));
+ ar[11] = add_round_shift_s32(vsubq_s32(right[4], right[11]));
+ al[12] = add_round_shift_s32(vsubq_s32(left[3], left[12]));
+ ar[12] = add_round_shift_s32(vsubq_s32(right[3], right[12]));
+ al[13] = add_round_shift_s32(vsubq_s32(left[2], left[13]));
+ ar[13] = add_round_shift_s32(vsubq_s32(right[2], right[13]));
+ al[14] = add_round_shift_s32(vsubq_s32(left[1], left[14]));
+ ar[14] = add_round_shift_s32(vsubq_s32(right[1], right[14]));
+ al[15] = add_round_shift_s32(vsubq_s32(left[0], left[15]));
+ ar[15] = add_round_shift_s32(vsubq_s32(right[0], right[15]));
+
+ al[16] = add_round_shift_s32(left[16]);
+ ar[16] = add_round_shift_s32(right[16]);
+ al[17] = add_round_shift_s32(left[17]);
+ ar[17] = add_round_shift_s32(right[17]);
+ al[18] = add_round_shift_s32(left[18]);
+ ar[18] = add_round_shift_s32(right[18]);
+ al[19] = add_round_shift_s32(left[19]);
+ ar[19] = add_round_shift_s32(right[19]);
+
+ butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
+ cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
+ butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
+ cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
+ butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
+ cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
+ butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
+ cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
+
+ al[20] = add_round_shift_s32(al[20]);
+ ar[20] = add_round_shift_s32(ar[20]);
+ al[21] = add_round_shift_s32(al[21]);
+ ar[21] = add_round_shift_s32(ar[21]);
+ al[22] = add_round_shift_s32(al[22]);
+ ar[22] = add_round_shift_s32(ar[22]);
+ al[23] = add_round_shift_s32(al[23]);
+ ar[23] = add_round_shift_s32(ar[23]);
+ al[24] = add_round_shift_s32(al[24]);
+ ar[24] = add_round_shift_s32(ar[24]);
+ al[25] = add_round_shift_s32(al[25]);
+ ar[25] = add_round_shift_s32(ar[25]);
+ al[26] = add_round_shift_s32(al[26]);
+ ar[26] = add_round_shift_s32(ar[26]);
+ al[27] = add_round_shift_s32(al[27]);
+ ar[27] = add_round_shift_s32(ar[27]);
+
+ al[28] = add_round_shift_s32(left[28]);
+ ar[28] = add_round_shift_s32(right[28]);
+ al[29] = add_round_shift_s32(left[29]);
+ ar[29] = add_round_shift_s32(right[29]);
+ al[30] = add_round_shift_s32(left[30]);
+ ar[30] = add_round_shift_s32(right[30]);
+ al[31] = add_round_shift_s32(left[31]);
+ ar[31] = add_round_shift_s32(right[31]);
+
+ // Stage 3.
+ bl[0] = vaddq_s32(al[0], al[7]);
+ br[0] = vaddq_s32(ar[0], ar[7]);
+ bl[1] = vaddq_s32(al[1], al[6]);
+ br[1] = vaddq_s32(ar[1], ar[6]);
+ bl[2] = vaddq_s32(al[2], al[5]);
+ br[2] = vaddq_s32(ar[2], ar[5]);
+ bl[3] = vaddq_s32(al[3], al[4]);
+ br[3] = vaddq_s32(ar[3], ar[4]);
+
+ bl[4] = vsubq_s32(al[3], al[4]);
+ br[4] = vsubq_s32(ar[3], ar[4]);
+ bl[5] = vsubq_s32(al[2], al[5]);
+ br[5] = vsubq_s32(ar[2], ar[5]);
+ bl[6] = vsubq_s32(al[1], al[6]);
+ br[6] = vsubq_s32(ar[1], ar[6]);
+ bl[7] = vsubq_s32(al[0], al[7]);
+ br[7] = vsubq_s32(ar[0], ar[7]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+ bl[9] = al[9];
+ br[9] = ar[9];
+
+ butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
+ &bl[13], &br[13], &bl[10], &br[10]);
+ butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
+ &bl[12], &br[12], &bl[11], &br[11]);
+
+ bl[14] = al[14];
+ br[14] = ar[14];
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(al[16], al[23]);
+ br[16] = vaddq_s32(ar[16], ar[23]);
+ bl[17] = vaddq_s32(al[17], al[22]);
+ br[17] = vaddq_s32(ar[17], ar[22]);
+ bl[18] = vaddq_s32(al[18], al[21]);
+ br[18] = vaddq_s32(ar[18], ar[21]);
+ bl[19] = vaddq_s32(al[19], al[20]);
+ br[19] = vaddq_s32(ar[19], ar[20]);
+
+ bl[20] = vsubq_s32(al[19], al[20]);
+ br[20] = vsubq_s32(ar[19], ar[20]);
+ bl[21] = vsubq_s32(al[18], al[21]);
+ br[21] = vsubq_s32(ar[18], ar[21]);
+ bl[22] = vsubq_s32(al[17], al[22]);
+ br[22] = vsubq_s32(ar[17], ar[22]);
+ bl[23] = vsubq_s32(al[16], al[23]);
+ br[23] = vsubq_s32(ar[16], ar[23]);
+
+ bl[24] = vsubq_s32(al[31], al[24]);
+ br[24] = vsubq_s32(ar[31], ar[24]);
+ bl[25] = vsubq_s32(al[30], al[25]);
+ br[25] = vsubq_s32(ar[30], ar[25]);
+ bl[26] = vsubq_s32(al[29], al[26]);
+ br[26] = vsubq_s32(ar[29], ar[26]);
+ bl[27] = vsubq_s32(al[28], al[27]);
+ br[27] = vsubq_s32(ar[28], ar[27]);
+
+ bl[28] = vaddq_s32(al[28], al[27]);
+ br[28] = vaddq_s32(ar[28], ar[27]);
+ bl[29] = vaddq_s32(al[29], al[26]);
+ br[29] = vaddq_s32(ar[29], ar[26]);
+ bl[30] = vaddq_s32(al[30], al[25]);
+ br[30] = vaddq_s32(ar[30], ar[25]);
+ bl[31] = vaddq_s32(al[31], al[24]);
+ br[31] = vaddq_s32(ar[31], ar[24]);
+
+ // Stage 4.
+ al[0] = vaddq_s32(bl[0], bl[3]);
+ ar[0] = vaddq_s32(br[0], br[3]);
+ al[1] = vaddq_s32(bl[1], bl[2]);
+ ar[1] = vaddq_s32(br[1], br[2]);
+ al[2] = vsubq_s32(bl[1], bl[2]);
+ ar[2] = vsubq_s32(br[1], br[2]);
+ al[3] = vsubq_s32(bl[0], bl[3]);
+ ar[3] = vsubq_s32(br[0], br[3]);
+
+ al[4] = bl[4];
+ ar[4] = br[4];
+
+ butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
+ &ar[6], &al[5], &ar[5]);
+
+ al[7] = bl[7];
+ ar[7] = br[7];
+
+ al[8] = vaddq_s32(bl[8], bl[11]);
+ ar[8] = vaddq_s32(br[8], br[11]);
+ al[9] = vaddq_s32(bl[9], bl[10]);
+ ar[9] = vaddq_s32(br[9], br[10]);
+ al[10] = vsubq_s32(bl[9], bl[10]);
+ ar[10] = vsubq_s32(br[9], br[10]);
+ al[11] = vsubq_s32(bl[8], bl[11]);
+ ar[11] = vsubq_s32(br[8], br[11]);
+ al[12] = vsubq_s32(bl[15], bl[12]);
+ ar[12] = vsubq_s32(br[15], br[12]);
+ al[13] = vsubq_s32(bl[14], bl[13]);
+ ar[13] = vsubq_s32(br[14], br[13]);
+ al[14] = vaddq_s32(bl[14], bl[13]);
+ ar[14] = vaddq_s32(br[14], br[13]);
+ al[15] = vaddq_s32(bl[15], bl[12]);
+ ar[15] = vaddq_s32(br[15], br[12]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[17] = bl[17];
+ ar[17] = br[17];
+
+ butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_8_64,
+ cospi_24_64, &al[29], &ar[29], &al[18], &ar[18]);
+ butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_8_64,
+ cospi_24_64, &al[28], &ar[28], &al[19], &ar[19]);
+ butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_24_64,
+ -cospi_8_64, &al[27], &ar[27], &al[20], &ar[20]);
+ butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_24_64,
+ -cospi_8_64, &al[26], &ar[26], &al[21], &ar[21]);
+
+ al[22] = bl[22];
+ ar[22] = br[22];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[25] = bl[25];
+ ar[25] = br[25];
+
+ al[30] = bl[30];
+ ar[30] = br[30];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ // Stage 5.
+ butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
+ &br[0], &bl[1], &br[1]);
+ butterfly_two_coeff_s32(al[3], ar[3], al[2], ar[2], cospi_8_64, cospi_24_64,
+ &bl[2], &br[2], &bl[3], &br[3]);
+
+ bl[4] = vaddq_s32(al[4], al[5]);
+ br[4] = vaddq_s32(ar[4], ar[5]);
+ bl[5] = vsubq_s32(al[4], al[5]);
+ br[5] = vsubq_s32(ar[4], ar[5]);
+ bl[6] = vsubq_s32(al[7], al[6]);
+ br[6] = vsubq_s32(ar[7], ar[6]);
+ bl[7] = vaddq_s32(al[7], al[6]);
+ br[7] = vaddq_s32(ar[7], ar[6]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+
+ butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_8_64, cospi_24_64,
+ &bl[14], &br[14], &bl[9], &br[9]);
+ butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_24_64,
+ -cospi_8_64, &bl[13], &br[13], &bl[10], &br[10]);
+
+ bl[11] = al[11];
+ br[11] = ar[11];
+ bl[12] = al[12];
+ br[12] = ar[12];
+
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(al[19], al[16]);
+ br[16] = vaddq_s32(ar[19], ar[16]);
+ bl[17] = vaddq_s32(al[18], al[17]);
+ br[17] = vaddq_s32(ar[18], ar[17]);
+ bl[18] = vsubq_s32(al[17], al[18]);
+ br[18] = vsubq_s32(ar[17], ar[18]);
+ bl[19] = vsubq_s32(al[16], al[19]);
+ br[19] = vsubq_s32(ar[16], ar[19]);
+ bl[20] = vsubq_s32(al[23], al[20]);
+ br[20] = vsubq_s32(ar[23], ar[20]);
+ bl[21] = vsubq_s32(al[22], al[21]);
+ br[21] = vsubq_s32(ar[22], ar[21]);
+ bl[22] = vaddq_s32(al[21], al[22]);
+ br[22] = vaddq_s32(ar[21], ar[22]);
+ bl[23] = vaddq_s32(al[20], al[23]);
+ br[23] = vaddq_s32(ar[20], ar[23]);
+ bl[24] = vaddq_s32(al[27], al[24]);
+ br[24] = vaddq_s32(ar[27], ar[24]);
+ bl[25] = vaddq_s32(al[26], al[25]);
+ br[25] = vaddq_s32(ar[26], ar[25]);
+ bl[26] = vsubq_s32(al[25], al[26]);
+ br[26] = vsubq_s32(ar[25], ar[26]);
+ bl[27] = vsubq_s32(al[24], al[27]);
+ br[27] = vsubq_s32(ar[24], ar[27]);
+ bl[28] = vsubq_s32(al[31], al[28]);
+ br[28] = vsubq_s32(ar[31], ar[28]);
+ bl[29] = vsubq_s32(al[30], al[29]);
+ br[29] = vsubq_s32(ar[30], ar[29]);
+ bl[30] = vaddq_s32(al[29], al[30]);
+ br[30] = vaddq_s32(ar[29], ar[30]);
+ bl[31] = vaddq_s32(al[28], al[31]);
+ br[31] = vaddq_s32(ar[28], ar[31]);
+
+ // Stage 6.
+ al[0] = bl[0];
+ ar[0] = br[0];
+ al[1] = bl[1];
+ ar[1] = br[1];
+ al[2] = bl[2];
+ ar[2] = br[2];
+ al[3] = bl[3];
+ ar[3] = br[3];
+
+ butterfly_two_coeff_s32(bl[7], br[7], bl[4], br[4], cospi_4_64, cospi_28_64,
+ &al[4], &ar[4], &al[7], &ar[7]);
+ butterfly_two_coeff_s32(bl[6], br[6], bl[5], br[5], cospi_20_64, cospi_12_64,
+ &al[5], &ar[5], &al[6], &ar[6]);
+
+ al[8] = vaddq_s32(bl[8], bl[9]);
+ ar[8] = vaddq_s32(br[8], br[9]);
+ al[9] = vsubq_s32(bl[8], bl[9]);
+ ar[9] = vsubq_s32(br[8], br[9]);
+ al[10] = vsubq_s32(bl[11], bl[10]);
+ ar[10] = vsubq_s32(br[11], br[10]);
+ al[11] = vaddq_s32(bl[11], bl[10]);
+ ar[11] = vaddq_s32(br[11], br[10]);
+ al[12] = vaddq_s32(bl[12], bl[13]);
+ ar[12] = vaddq_s32(br[12], br[13]);
+ al[13] = vsubq_s32(bl[12], bl[13]);
+ ar[13] = vsubq_s32(br[12], br[13]);
+ al[14] = vsubq_s32(bl[15], bl[14]);
+ ar[14] = vsubq_s32(br[15], br[14]);
+ al[15] = vaddq_s32(bl[15], bl[14]);
+ ar[15] = vaddq_s32(br[15], br[14]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[19] = bl[19];
+ ar[19] = br[19];
+ al[20] = bl[20];
+ ar[20] = br[20];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[27] = bl[27];
+ ar[27] = br[27];
+ al[28] = bl[28];
+ ar[28] = br[28];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_4_64,
+ cospi_28_64, &al[30], &ar[30], &al[17], &ar[17]);
+ butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_28_64,
+ -cospi_4_64, &al[29], &ar[29], &al[18], &ar[18]);
+ butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_20_64,
+ cospi_12_64, &al[26], &ar[26], &al[21], &ar[21]);
+ butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_12_64,
+ -cospi_20_64, &al[25], &ar[25], &al[22], &ar[22]);
+
+ // Stage 7.
+ bl[0] = al[0];
+ br[0] = ar[0];
+ bl[1] = al[1];
+ br[1] = ar[1];
+ bl[2] = al[2];
+ br[2] = ar[2];
+ bl[3] = al[3];
+ br[3] = ar[3];
+ bl[4] = al[4];
+ br[4] = ar[4];
+ bl[5] = al[5];
+ br[5] = ar[5];
+ bl[6] = al[6];
+ br[6] = ar[6];
+ bl[7] = al[7];
+ br[7] = ar[7];
+
+ butterfly_two_coeff_s32(al[15], ar[15], al[8], ar[8], cospi_2_64, cospi_30_64,
+ &bl[8], &br[8], &bl[15], &br[15]);
+ butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_18_64,
+ cospi_14_64, &bl[9], &br[9], &bl[14], &br[14]);
+ butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_10_64,
+ cospi_22_64, &bl[10], &br[10], &bl[13], &br[13]);
+ butterfly_two_coeff_s32(al[12], ar[12], al[11], ar[11], cospi_26_64,
+ cospi_6_64, &bl[11], &br[11], &bl[12], &br[12]);
+
+ bl[16] = vaddq_s32(al[16], al[17]);
+ br[16] = vaddq_s32(ar[16], ar[17]);
+ bl[17] = vsubq_s32(al[16], al[17]);
+ br[17] = vsubq_s32(ar[16], ar[17]);
+ bl[18] = vsubq_s32(al[19], al[18]);
+ br[18] = vsubq_s32(ar[19], ar[18]);
+ bl[19] = vaddq_s32(al[19], al[18]);
+ br[19] = vaddq_s32(ar[19], ar[18]);
+ bl[20] = vaddq_s32(al[20], al[21]);
+ br[20] = vaddq_s32(ar[20], ar[21]);
+ bl[21] = vsubq_s32(al[20], al[21]);
+ br[21] = vsubq_s32(ar[20], ar[21]);
+ bl[22] = vsubq_s32(al[23], al[22]);
+ br[22] = vsubq_s32(ar[23], ar[22]);
+ bl[23] = vaddq_s32(al[23], al[22]);
+ br[23] = vaddq_s32(ar[23], ar[22]);
+ bl[24] = vaddq_s32(al[24], al[25]);
+ br[24] = vaddq_s32(ar[24], ar[25]);
+ bl[25] = vsubq_s32(al[24], al[25]);
+ br[25] = vsubq_s32(ar[24], ar[25]);
+ bl[26] = vsubq_s32(al[27], al[26]);
+ br[26] = vsubq_s32(ar[27], ar[26]);
+ bl[27] = vaddq_s32(al[27], al[26]);
+ br[27] = vaddq_s32(ar[27], ar[26]);
+ bl[28] = vaddq_s32(al[28], al[29]);
+ br[28] = vaddq_s32(ar[28], ar[29]);
+ bl[29] = vsubq_s32(al[28], al[29]);
+ br[29] = vsubq_s32(ar[28], ar[29]);
+ bl[30] = vsubq_s32(al[31], al[30]);
+ br[30] = vsubq_s32(ar[31], ar[30]);
+ bl[31] = vaddq_s32(al[31], al[30]);
+ br[31] = vaddq_s32(ar[31], ar[30]);
+
+ // Final stage.
+ left[0] = bl[0];
+ right[0] = br[0];
+ left[16] = bl[1];
+ right[16] = br[1];
+ left[8] = bl[2];
+ right[8] = br[2];
+ left[24] = bl[3];
+ right[24] = br[3];
+ left[4] = bl[4];
+ right[4] = br[4];
+ left[20] = bl[5];
+ right[20] = br[5];
+ left[12] = bl[6];
+ right[12] = br[6];
+ left[28] = bl[7];
+ right[28] = br[7];
+ left[2] = bl[8];
+ right[2] = br[8];
+ left[18] = bl[9];
+ right[18] = br[9];
+ left[10] = bl[10];
+ right[10] = br[10];
+ left[26] = bl[11];
+ right[26] = br[11];
+ left[6] = bl[12];
+ right[6] = br[12];
+ left[22] = bl[13];
+ right[22] = br[13];
+ left[14] = bl[14];
+ right[14] = br[14];
+ left[30] = bl[15];
+ right[30] = br[15];
+
+ butterfly_two_coeff_s32(bl[31], br[31], bl[16], br[16], cospi_1_64,
+ cospi_31_64, &al[1], &ar[1], &al[31], &ar[31]);
+ left[1] = al[1];
+ right[1] = ar[1];
+ left[31] = al[31];
+ right[31] = ar[31];
+
+ butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_17_64,
+ cospi_15_64, &al[17], &ar[17], &al[15], &ar[15]);
+ left[17] = al[17];
+ right[17] = ar[17];
+ left[15] = al[15];
+ right[15] = ar[15];
+
+ butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_9_64,
+ cospi_23_64, &al[9], &ar[9], &al[23], &ar[23]);
+ left[9] = al[9];
+ right[9] = ar[9];
+ left[23] = al[23];
+ right[23] = ar[23];
+
+ butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_25_64,
+ cospi_7_64, &al[25], &ar[25], &al[7], &ar[7]);
+ left[25] = al[25];
+ right[25] = ar[25];
+ left[7] = al[7];
+ right[7] = ar[7];
+
+ butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_5_64,
+ cospi_27_64, &al[5], &ar[5], &al[27], &ar[27]);
+ left[5] = al[5];
+ right[5] = ar[5];
+ left[27] = al[27];
+ right[27] = ar[27];
+
+ butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_21_64,
+ cospi_11_64, &al[21], &ar[21], &al[11], &ar[11]);
+ left[21] = al[21];
+ right[21] = ar[21];
+ left[11] = al[11];
+ right[11] = ar[11];
+
+ butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_13_64,
+ cospi_19_64, &al[13], &ar[13], &al[19], &ar[19]);
+ left[13] = al[13];
+ right[13] = ar[13];
+ left[19] = al[19];
+ right[19] = ar[19];
+
+ butterfly_two_coeff_s32(bl[24], br[24], bl[23], br[23], cospi_29_64,
+ cospi_3_64, &al[29], &ar[29], &al[3], &ar[3]);
+ left[29] = al[29];
+ right[29] = ar[29];
+ left[3] = al[3];
+ right[3] = ar[3];
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#endif // VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
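The final-stage stores in the 8x32 body functions above scatter the 32 stage-7 results into bit-reversed row positions (0, 16, 8, 24, 4, 20, ...). A small scalar sketch of that index mapping, with a hypothetical helper name, purely for reference:

// Illustrative only: the output row receiving stage-7 element k in the
// final-stage stores above is the 5-bit bit-reversal of k, e.g.
// k = 1 -> row 16, k = 8 -> row 2, k = 18 -> row 9.
static int fdct32_output_row(int k) {
  int row = 0;
  int bit;
  for (bit = 0; bit < 5; ++bit) {
    row = (row << 1) | ((k >> bit) & 1);
  }
  return row;
}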
diff --git a/libvpx/vpx_dsp/arm/fdct_neon.c b/libvpx/vpx_dsp/arm/fdct4x4_neon.c
index 2827791f1..3b9196fae 100644
--- a/libvpx/vpx_dsp/arm/fdct_neon.c
+++ b/libvpx/vpx_dsp/arm/fdct4x4_neon.c
@@ -18,10 +18,10 @@
#include "vpx_dsp/arm/fdct_neon.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct4x4_neon.h"
void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
int stride) {
- int i;
// input[M * stride] * 16
int16x4_t in[4];
in[0] = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
@@ -34,9 +34,8 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1));
in[0] = vadd_s16(in[0], one);
}
- for (i = 0; i < 2; ++i) {
- vpx_fdct4x4_pass1_neon(in);
- }
+ vpx_fdct4x4_pass1_neon(in);
+ vpx_fdct4x4_pass2_neon(in);
{
// Not quite a rounding shift. Only add 1 despite shifting by 2.
const int16x8_t one = vdupq_n_s16(1);
@@ -48,3 +47,39 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
store_s16q_to_tran_low(final_output + 1 * 8, out_23);
}
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ static const int32x4_t const_1000 = { 1, 0, 0, 0 };
+ const int32x4_t const_one = vdupq_n_s32(1);
+
+ // input[M * stride] * 16
+ int32x4_t in[4];
+ in[0] = vshll_n_s16(vld1_s16(input + 0 * stride), 4);
+ in[1] = vshll_n_s16(vld1_s16(input + 1 * stride), 4);
+ in[2] = vshll_n_s16(vld1_s16(input + 2 * stride), 4);
+ in[3] = vshll_n_s16(vld1_s16(input + 3 * stride), 4);
+
+ // If the very first value != 0, then add 1.
+ if (input[0] != 0) {
+ in[0] = vaddq_s32(in[0], const_1000);
+ }
+
+ vpx_highbd_fdct4x4_pass1_neon(in);
+ vpx_highbd_fdct4x4_pass1_neon(in);
+ {
+ // Not quite a rounding shift. Only add 1 despite shifting by 2.
+ in[0] = vshrq_n_s32(vaddq_s32(in[0], const_one), 2);
+ in[1] = vshrq_n_s32(vaddq_s32(in[1], const_one), 2);
+ in[2] = vshrq_n_s32(vaddq_s32(in[2], const_one), 2);
+ in[3] = vshrq_n_s32(vaddq_s32(in[3], const_one), 2);
+
+ vst1q_s32(final_output, in[0]);
+ vst1q_s32(final_output + 4, in[1]);
+ vst1q_s32(final_output + 8, in[2]);
+ vst1q_s32(final_output + 12, in[3]);
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
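Two details of vpx_highbd_fdct4x4_neon above are easy to misread: the pass helper transposes its output, so calling vpx_highbd_fdct4x4_pass1_neon twice processes rows and then columns, and the final scaling deliberately adds 1 rather than 2 before shifting right by 2. A minimal scalar sketch of that last step (names are illustrative, not libvpx API):

#include <stdint.h>
#include <stdio.h>

// Sketch of the final scaling used above: add 1, then shift right by 2.
// A true rounding shift would add 2 (half of 4) instead.
static int32_t final_scale(int32_t x) { return (x + 1) >> 2; }

int main(void) {
  // For x = 6, a rounding shift gives (6 + 2) >> 2 = 2,
  // while the variant above gives (6 + 1) >> 2 = 1.
  printf("%d vs %d\n", (6 + 2) >> 2, final_scale(6));
  return 0;
}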
diff --git a/libvpx/vpx_dsp/arm/fdct4x4_neon.h b/libvpx/vpx_dsp/arm/fdct4x4_neon.h
new file mode 100644
index 000000000..de3db9774
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/fdct4x4_neon.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) {
+ int16x4_t out[4];
+
+ const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
+ const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
+
+ // in_0 +/- in_3, in_1 +/- in_2
+ const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+ const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+ // step_0 +/- step_1, step_2 +/- step_3
+ const int16x4_t s_0 = vget_low_s16(s_01);
+ const int16x4_t s_1 = vget_high_s16(s_01);
+ const int16x4_t s_2 = vget_high_s16(s_32);
+ const int16x4_t s_3 = vget_low_s16(s_32);
+
+ // fdct_round_shift(s_0 +/- s_1) * cospi_16_64
+ butterfly_one_coeff_s16_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]);
+
+ // s_3 * cospi_8_64 + s_2 * cospi_24_64
+ // s_3 * cospi_24_64 - s_2 * cospi_8_64
+ butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]);
+
+ transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);
+
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+}
+
+static INLINE void vpx_fdct4x4_pass2_neon(int16x4_t *in) {
+ int16x4_t out[4];
+
+ const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
+ const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
+
+ // in_0 +/- in_3, in_1 +/- in_2
+ const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+ const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+ // step_0 +/- step_1, step_2 +/- step_3
+ const int16x4_t s_0 = vget_low_s16(s_01);
+ const int16x4_t s_1 = vget_high_s16(s_01);
+ const int16x4_t s_2 = vget_high_s16(s_32);
+ const int16x4_t s_3 = vget_low_s16(s_32);
+
+ // fdct_round_shift(s_0 +/- s_1) * cospi_16_64
+ butterfly_one_coeff_s16_s32_fast_narrow_half(s_0, s_1, cospi_16_64, &out[0],
+ &out[2]);
+
+ // s_3 * cospi_8_64 + s_2 * cospi_24_64
+ // s_3 * cospi_24_64 - s_2 * cospi_8_64
+ butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]);
+
+ transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);
+
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) {
+ int32x4_t out[4];
+ // in_0 +/- in_3, in_1 +/- in_2
+ const int32x4_t s_0 = vaddq_s32(in[0], in[3]);
+ const int32x4_t s_1 = vaddq_s32(in[1], in[2]);
+ const int32x4_t s_2 = vsubq_s32(in[1], in[2]);
+ const int32x4_t s_3 = vsubq_s32(in[0], in[3]);
+
+ butterfly_one_coeff_s32_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]);
+
+ // out[1] = s_3 * cospi_8_64 + s_2 * cospi_24_64
+ // out[3] = s_3 * cospi_24_64 - s_2 * cospi_8_64
+ butterfly_two_coeff_s32_s64_narrow_half(s_3, s_2, cospi_8_64, cospi_24_64,
+ &out[1], &out[3]);
+
+ transpose_s32_4x4(&out[0], &out[1], &out[2], &out[3]);
+
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
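The only difference between the two 4x4 passes above is the centre butterfly: pass 1 uses the 16-bit vqrdmulh approximation, while pass 2 widens to 32 bits and narrows back for full accuracy. Both approximate the same scalar rounding; the sketch below models it, assuming libvpx's DCT_CONST_BITS of 14 (the model_ names are illustrative, not library API):

#include <stdint.h>

// Scalar model (sketch) of fdct_round_shift((a + b) * c) as computed by the
// butterfly helpers: multiply by a 14-bit cospi constant, then round the
// product back down by DCT_CONST_BITS (14 in libvpx).
#define MODEL_DCT_CONST_BITS 14

static int16_t model_butterfly_add(int16_t a, int16_t b, int16_t cospi) {
  const int32_t product = (int32_t)(a + b) * cospi;
  return (int16_t)((product + (1 << (MODEL_DCT_CONST_BITS - 1))) >>
                   MODEL_DCT_CONST_BITS);
}

// The "fast" NEON variants reach the same scaling through vqrdmulh:
// vqrdmulh_s16(x, 2 * c) computes (2 * x * (2 * c) + (1 << 15)) >> 16,
// which equals (x * c + (1 << 13)) >> 14 whenever nothing saturates.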
diff --git a/libvpx/vpx_dsp/arm/fdct8x8_neon.c b/libvpx/vpx_dsp/arm/fdct8x8_neon.c
new file mode 100644
index 000000000..75ee6f223
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/fdct8x8_neon.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/fdct8x8_neon.h"
+
+void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ // stage 1
+ int16x8_t in[8];
+ in[0] = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
+ in[1] = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
+ in[2] = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
+ in[3] = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
+ in[4] = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
+ in[5] = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
+ in[6] = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
+ in[7] = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
+
+ vpx_fdct8x8_pass1_neon(in);
+ vpx_fdct8x8_pass2_neon(in);
+ {
+ // from vpx_dct_sse2.c
+ // Post-condition (division by two)
+    // division of two 16-bit signed numbers using shifts
+ // n / 2 = (n - (n >> 15)) >> 1
+ const int16x8_t sign_in0 = vshrq_n_s16(in[0], 15);
+ const int16x8_t sign_in1 = vshrq_n_s16(in[1], 15);
+ const int16x8_t sign_in2 = vshrq_n_s16(in[2], 15);
+ const int16x8_t sign_in3 = vshrq_n_s16(in[3], 15);
+ const int16x8_t sign_in4 = vshrq_n_s16(in[4], 15);
+ const int16x8_t sign_in5 = vshrq_n_s16(in[5], 15);
+ const int16x8_t sign_in6 = vshrq_n_s16(in[6], 15);
+ const int16x8_t sign_in7 = vshrq_n_s16(in[7], 15);
+ in[0] = vhsubq_s16(in[0], sign_in0);
+ in[1] = vhsubq_s16(in[1], sign_in1);
+ in[2] = vhsubq_s16(in[2], sign_in2);
+ in[3] = vhsubq_s16(in[3], sign_in3);
+ in[4] = vhsubq_s16(in[4], sign_in4);
+ in[5] = vhsubq_s16(in[5], sign_in5);
+ in[6] = vhsubq_s16(in[6], sign_in6);
+ in[7] = vhsubq_s16(in[7], sign_in7);
+ // store results
+ store_s16q_to_tran_low(final_output + 0 * 8, in[0]);
+ store_s16q_to_tran_low(final_output + 1 * 8, in[1]);
+ store_s16q_to_tran_low(final_output + 2 * 8, in[2]);
+ store_s16q_to_tran_low(final_output + 3 * 8, in[3]);
+ store_s16q_to_tran_low(final_output + 4 * 8, in[4]);
+ store_s16q_to_tran_low(final_output + 5 * 8, in[5]);
+ store_s16q_to_tran_low(final_output + 6 * 8, in[6]);
+ store_s16q_to_tran_low(final_output + 7 * 8, in[7]);
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+  // input[M * stride] * 4
+ int32x4_t left[8], right[8];
+ int16x8_t in[8];
+ in[0] = vld1q_s16(input + 0 * stride);
+ in[1] = vld1q_s16(input + 1 * stride);
+ in[2] = vld1q_s16(input + 2 * stride);
+ in[3] = vld1q_s16(input + 3 * stride);
+ in[4] = vld1q_s16(input + 4 * stride);
+ in[5] = vld1q_s16(input + 5 * stride);
+ in[6] = vld1q_s16(input + 6 * stride);
+ in[7] = vld1q_s16(input + 7 * stride);
+
+ left[0] = vshll_n_s16(vget_low_s16(in[0]), 2);
+ left[1] = vshll_n_s16(vget_low_s16(in[1]), 2);
+ left[2] = vshll_n_s16(vget_low_s16(in[2]), 2);
+ left[3] = vshll_n_s16(vget_low_s16(in[3]), 2);
+ left[4] = vshll_n_s16(vget_low_s16(in[4]), 2);
+ left[5] = vshll_n_s16(vget_low_s16(in[5]), 2);
+ left[6] = vshll_n_s16(vget_low_s16(in[6]), 2);
+ left[7] = vshll_n_s16(vget_low_s16(in[7]), 2);
+ right[0] = vshll_n_s16(vget_high_s16(in[0]), 2);
+ right[1] = vshll_n_s16(vget_high_s16(in[1]), 2);
+ right[2] = vshll_n_s16(vget_high_s16(in[2]), 2);
+ right[3] = vshll_n_s16(vget_high_s16(in[3]), 2);
+ right[4] = vshll_n_s16(vget_high_s16(in[4]), 2);
+ right[5] = vshll_n_s16(vget_high_s16(in[5]), 2);
+ right[6] = vshll_n_s16(vget_high_s16(in[6]), 2);
+ right[7] = vshll_n_s16(vget_high_s16(in[7]), 2);
+
+ vpx_highbd_fdct8x8_pass1_neon(left, right);
+ vpx_highbd_fdct8x8_pass2_neon(left, right);
+ {
+ left[0] = add_round_shift_half_s32(left[0]);
+ left[1] = add_round_shift_half_s32(left[1]);
+ left[2] = add_round_shift_half_s32(left[2]);
+ left[3] = add_round_shift_half_s32(left[3]);
+ left[4] = add_round_shift_half_s32(left[4]);
+ left[5] = add_round_shift_half_s32(left[5]);
+ left[6] = add_round_shift_half_s32(left[6]);
+ left[7] = add_round_shift_half_s32(left[7]);
+ right[0] = add_round_shift_half_s32(right[0]);
+ right[1] = add_round_shift_half_s32(right[1]);
+ right[2] = add_round_shift_half_s32(right[2]);
+ right[3] = add_round_shift_half_s32(right[3]);
+ right[4] = add_round_shift_half_s32(right[4]);
+ right[5] = add_round_shift_half_s32(right[5]);
+ right[6] = add_round_shift_half_s32(right[6]);
+ right[7] = add_round_shift_half_s32(right[7]);
+
+ // store results
+ vst1q_s32(final_output, left[0]);
+ vst1q_s32(final_output + 4, right[0]);
+ vst1q_s32(final_output + 8, left[1]);
+ vst1q_s32(final_output + 12, right[1]);
+ vst1q_s32(final_output + 16, left[2]);
+ vst1q_s32(final_output + 20, right[2]);
+ vst1q_s32(final_output + 24, left[3]);
+ vst1q_s32(final_output + 28, right[3]);
+ vst1q_s32(final_output + 32, left[4]);
+ vst1q_s32(final_output + 36, right[4]);
+ vst1q_s32(final_output + 40, left[5]);
+ vst1q_s32(final_output + 44, right[5]);
+ vst1q_s32(final_output + 48, left[6]);
+ vst1q_s32(final_output + 52, right[6]);
+ vst1q_s32(final_output + 56, left[7]);
+ vst1q_s32(final_output + 60, right[7]);
+ }
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
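The post-condition block in vpx_fdct8x8_neon above halves each coefficient with rounding toward zero using n / 2 = (n - (n >> 15)) >> 1, and vhsubq_s16 provides the subtract-and-halve in a single instruction. A small scalar check of the identity (sketch only):

#include <assert.h>
#include <stdint.h>

// Sketch: (n - (n >> 15)) >> 1 rounds toward zero, unlike a plain
// arithmetic shift, which rounds toward negative infinity.
static int16_t div2_toward_zero(int16_t n) {
  const int16_t sign = (int16_t)(n >> 15);  // 0 for n >= 0, -1 for n < 0
  return (int16_t)((n - sign) >> 1);
}

int main(void) {
  assert(div2_toward_zero(7) == 3);    // 7 >> 1 is also 3
  assert(div2_toward_zero(-7) == -3);  // but -7 >> 1 would be -4
  assert(div2_toward_zero(-8) == -4);
  return 0;
}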
diff --git a/libvpx/vpx_dsp/arm/fdct8x8_neon.h b/libvpx/vpx_dsp/arm/fdct8x8_neon.h
new file mode 100644
index 000000000..d8fa60044
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/fdct8x8_neon.h
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in,
+ int16x8_t *out) {
+ int16x8_t s[8], x[4], t[2];
+
+ s[0] = vaddq_s16(in[0], in[7]);
+ s[1] = vaddq_s16(in[1], in[6]);
+ s[2] = vaddq_s16(in[2], in[5]);
+ s[3] = vaddq_s16(in[3], in[4]);
+ s[4] = vsubq_s16(in[3], in[4]);
+ s[5] = vsubq_s16(in[2], in[5]);
+ s[6] = vsubq_s16(in[1], in[6]);
+ s[7] = vsubq_s16(in[0], in[7]);
+ // fdct4(step, step);
+ x[0] = vaddq_s16(s[0], s[3]);
+ x[1] = vaddq_s16(s[1], s[2]);
+ x[2] = vsubq_s16(s[1], s[2]);
+ x[3] = vsubq_s16(s[0], s[3]);
+
+ // fdct4(step, step);
+ // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s16_fast(x[0], x[1], cospi_16_64, &out[0], &out[4]);
+ // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+ // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+ butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]);
+
+ // Stage 2
+ // t0 = (s6 - s5) * cospi_16_64;
+ // t1 = (s6 + s5) * cospi_16_64;
+ butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &t[1], &t[0]);
+
+ // Stage 3
+ x[0] = vaddq_s16(s[4], t[0]);
+ x[1] = vsubq_s16(s[4], t[0]);
+ x[2] = vsubq_s16(s[7], t[1]);
+ x[3] = vaddq_s16(s[7], t[1]);
+
+ // Stage 4
+ // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+ // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+ butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]);
+
+ // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+ // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+ butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]);
+}
+
+static INLINE void vpx_fdct8x8_pass2_notranspose_neon(int16x8_t *in,
+ int16x8_t *out) {
+ int16x8_t s[8], x[4], t[2];
+
+ s[0] = vaddq_s16(in[0], in[7]);
+ s[1] = vaddq_s16(in[1], in[6]);
+ s[2] = vaddq_s16(in[2], in[5]);
+ s[3] = vaddq_s16(in[3], in[4]);
+ s[4] = vsubq_s16(in[3], in[4]);
+ s[5] = vsubq_s16(in[2], in[5]);
+ s[6] = vsubq_s16(in[1], in[6]);
+ s[7] = vsubq_s16(in[0], in[7]);
+ // fdct4(step, step);
+ x[0] = vaddq_s16(s[0], s[3]);
+ x[1] = vaddq_s16(s[1], s[2]);
+ x[2] = vsubq_s16(s[1], s[2]);
+ x[3] = vsubq_s16(s[0], s[3]);
+
+ // fdct4(step, step);
+ // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0],
+ &out[4]);
+ // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+ // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+ butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]);
+
+ // Stage 2
+ // t0 = (s6 - s5) * cospi_16_64;
+ // t1 = (s6 + s5) * cospi_16_64;
+ butterfly_one_coeff_s16_s32_fast_narrow(s[6], s[5], cospi_16_64, &t[1],
+ &t[0]);
+
+ // Stage 3
+ x[0] = vaddq_s16(s[4], t[0]);
+ x[1] = vsubq_s16(s[4], t[0]);
+ x[2] = vsubq_s16(s[7], t[1]);
+ x[3] = vaddq_s16(s[7], t[1]);
+
+ // Stage 4
+ // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+ // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+ butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]);
+
+ // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+ // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+ butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]);
+}
+
+static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) {
+ int16x8_t out[8];
+ vpx_fdct8x8_pass1_notranspose_neon(in, out);
+ // transpose 8x8
+ transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+ in[4] = out[4];
+ in[5] = out[5];
+ in[6] = out[6];
+ in[7] = out[7];
+}
+
+static INLINE void vpx_fdct8x8_pass2_neon(int16x8_t *in) {
+ int16x8_t out[8];
+ vpx_fdct8x8_pass2_notranspose_neon(in, out);
+ // transpose 8x8
+ transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+ in[4] = out[4];
+ in[5] = out[5];
+ in[6] = out[6];
+ in[7] = out[7];
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left,
+ int32x4_t *right) {
+ int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];
+
+ sl[0] = vaddq_s32(left[0], left[7]);
+ sl[1] = vaddq_s32(left[1], left[6]);
+ sl[2] = vaddq_s32(left[2], left[5]);
+ sl[3] = vaddq_s32(left[3], left[4]);
+ sl[4] = vsubq_s32(left[3], left[4]);
+ sl[5] = vsubq_s32(left[2], left[5]);
+ sl[6] = vsubq_s32(left[1], left[6]);
+ sl[7] = vsubq_s32(left[0], left[7]);
+ sr[0] = vaddq_s32(right[0], right[7]);
+ sr[1] = vaddq_s32(right[1], right[6]);
+ sr[2] = vaddq_s32(right[2], right[5]);
+ sr[3] = vaddq_s32(right[3], right[4]);
+ sr[4] = vsubq_s32(right[3], right[4]);
+ sr[5] = vsubq_s32(right[2], right[5]);
+ sr[6] = vsubq_s32(right[1], right[6]);
+ sr[7] = vsubq_s32(right[0], right[7]);
+
+ // fdct4(step, step);
+ // x0 = s0 + s3;
+ xl[0] = vaddq_s32(sl[0], sl[3]);
+ xr[0] = vaddq_s32(sr[0], sr[3]);
+ // x1 = s1 + s2;
+ xl[1] = vaddq_s32(sl[1], sl[2]);
+ xr[1] = vaddq_s32(sr[1], sr[2]);
+ // x2 = s1 - s2;
+ xl[2] = vsubq_s32(sl[1], sl[2]);
+ xr[2] = vsubq_s32(sr[1], sr[2]);
+ // x3 = s0 - s3;
+ xl[3] = vsubq_s32(sl[0], sl[3]);
+ xr[3] = vsubq_s32(sr[0], sr[3]);
+
+ // fdct4(step, step);
+ // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+ &left[0], &right[0], &left[4], &right[4]);
+ // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+ // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+ butterfly_two_coeff_s32(xl[3], xr[3], xl[2], xr[2], cospi_8_64, cospi_24_64,
+ &left[2], &right[2], &left[6], &right[6]);
+
+ // Stage 2
+ // t0 = (s6 - s5) * cospi_16_64;
+ // t1 = (s6 + s5) * cospi_16_64;
+ butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1],
+ &tr[1], &tl[0], &tr[0]);
+
+ // Stage 3
+ xl[0] = vaddq_s32(sl[4], tl[0]);
+ xr[0] = vaddq_s32(sr[4], tr[0]);
+ xl[1] = vsubq_s32(sl[4], tl[0]);
+ xr[1] = vsubq_s32(sr[4], tr[0]);
+ xl[2] = vsubq_s32(sl[7], tl[1]);
+ xr[2] = vsubq_s32(sr[7], tr[1]);
+ xl[3] = vaddq_s32(sl[7], tl[1]);
+ xr[3] = vaddq_s32(sr[7], tr[1]);
+
+ // Stage 4
+ // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+ // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+ butterfly_two_coeff_s32(xl[3], xr[3], xl[0], xr[0], cospi_4_64, cospi_28_64,
+ &left[1], &right[1], &left[7], &right[7]);
+
+ // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+ // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+ butterfly_two_coeff_s32(xl[2], xr[2], xl[1], xr[1], cospi_20_64, cospi_12_64,
+ &left[5], &right[5], &left[3], &right[3]);
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass2_notranspose_neon(int32x4_t *left,
+ int32x4_t *right) {
+ int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];
+
+ sl[0] = vaddq_s32(left[0], left[7]);
+ sl[1] = vaddq_s32(left[1], left[6]);
+ sl[2] = vaddq_s32(left[2], left[5]);
+ sl[3] = vaddq_s32(left[3], left[4]);
+ sl[4] = vsubq_s32(left[3], left[4]);
+ sl[5] = vsubq_s32(left[2], left[5]);
+ sl[6] = vsubq_s32(left[1], left[6]);
+ sl[7] = vsubq_s32(left[0], left[7]);
+ sr[0] = vaddq_s32(right[0], right[7]);
+ sr[1] = vaddq_s32(right[1], right[6]);
+ sr[2] = vaddq_s32(right[2], right[5]);
+ sr[3] = vaddq_s32(right[3], right[4]);
+ sr[4] = vsubq_s32(right[3], right[4]);
+ sr[5] = vsubq_s32(right[2], right[5]);
+ sr[6] = vsubq_s32(right[1], right[6]);
+ sr[7] = vsubq_s32(right[0], right[7]);
+
+ // fdct4(step, step);
+ // x0 = s0 + s3;
+ xl[0] = vaddq_s32(sl[0], sl[3]);
+ xr[0] = vaddq_s32(sr[0], sr[3]);
+ // x1 = s1 + s2;
+ xl[1] = vaddq_s32(sl[1], sl[2]);
+ xr[1] = vaddq_s32(sr[1], sr[2]);
+ // x2 = s1 - s2;
+ xl[2] = vsubq_s32(sl[1], sl[2]);
+ xr[2] = vsubq_s32(sr[1], sr[2]);
+ // x3 = s0 - s3;
+ xl[3] = vsubq_s32(sl[0], sl[3]);
+ xr[3] = vsubq_s32(sr[0], sr[3]);
+
+ // fdct4(step, step);
+ // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+ &left[0], &right[0], &left[4], &right[4]);
+ // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+ // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64,
+ cospi_24_64, &left[2], &right[2], &left[6],
+ &right[6]);
+
+ // Stage 2
+ // t0 = (s6 - s5) * cospi_16_64;
+ // t1 = (s6 + s5) * cospi_16_64;
+ butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1],
+ &tr[1], &tl[0], &tr[0]);
+
+ // Stage 3
+ xl[0] = vaddq_s32(sl[4], tl[0]);
+ xr[0] = vaddq_s32(sr[4], tr[0]);
+ xl[1] = vsubq_s32(sl[4], tl[0]);
+ xr[1] = vsubq_s32(sr[4], tr[0]);
+ xl[2] = vsubq_s32(sl[7], tl[1]);
+ xr[2] = vsubq_s32(sr[7], tr[1]);
+ xl[3] = vaddq_s32(sl[7], tl[1]);
+ xr[3] = vaddq_s32(sr[7], tr[1]);
+
+ // Stage 4
+ // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+ // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64,
+ cospi_28_64, &left[1], &right[1], &left[7],
+ &right[7]);
+
+ // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+ // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64,
+ cospi_12_64, &left[5], &right[5], &left[3],
+ &right[3]);
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left,
+ int32x4_t *right) {
+ int32x4x2_t out[8];
+ vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right);
+
+ out[0].val[0] = left[0];
+ out[0].val[1] = right[0];
+ out[1].val[0] = left[1];
+ out[1].val[1] = right[1];
+ out[2].val[0] = left[2];
+ out[2].val[1] = right[2];
+ out[3].val[0] = left[3];
+ out[3].val[1] = right[3];
+ out[4].val[0] = left[4];
+ out[4].val[1] = right[4];
+ out[5].val[0] = left[5];
+ out[5].val[1] = right[5];
+ out[6].val[0] = left[6];
+ out[6].val[1] = right[6];
+ out[7].val[0] = left[7];
+ out[7].val[1] = right[7];
+
+ transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+
+ left[0] = out[0].val[0];
+ right[0] = out[0].val[1];
+ left[1] = out[1].val[0];
+ right[1] = out[1].val[1];
+ left[2] = out[2].val[0];
+ right[2] = out[2].val[1];
+ left[3] = out[3].val[0];
+ right[3] = out[3].val[1];
+ left[4] = out[4].val[0];
+ right[4] = out[4].val[1];
+ left[5] = out[5].val[0];
+ right[5] = out[5].val[1];
+ left[6] = out[6].val[0];
+ right[6] = out[6].val[1];
+ left[7] = out[7].val[0];
+ right[7] = out[7].val[1];
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass2_neon(int32x4_t *left,
+ int32x4_t *right) {
+ int32x4x2_t out[8];
+ vpx_highbd_fdct8x8_pass2_notranspose_neon(left, right);
+
+ out[0].val[0] = left[0];
+ out[0].val[1] = right[0];
+ out[1].val[0] = left[1];
+ out[1].val[1] = right[1];
+ out[2].val[0] = left[2];
+ out[2].val[1] = right[2];
+ out[3].val[0] = left[3];
+ out[3].val[1] = right[3];
+ out[4].val[0] = left[4];
+ out[4].val[1] = right[4];
+ out[5].val[0] = left[5];
+ out[5].val[1] = right[5];
+ out[6].val[0] = left[6];
+ out[6].val[1] = right[6];
+ out[7].val[0] = left[7];
+ out[7].val[1] = right[7];
+
+ transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+
+ left[0] = out[0].val[0];
+ right[0] = out[0].val[1];
+ left[1] = out[1].val[0];
+ right[1] = out[1].val[1];
+ left[2] = out[2].val[0];
+ right[2] = out[2].val[1];
+ left[3] = out[3].val[0];
+ right[3] = out[3].val[1];
+ left[4] = out[4].val[0];
+ right[4] = out[4].val[1];
+ left[5] = out[5].val[0];
+ right[5] = out[5].val[1];
+ left[6] = out[6].val[0];
+ right[6] = out[6].val[1];
+ left[7] = out[7].val[0];
+ right[7] = out[7].val[1];
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
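Throughout the high bit-depth helpers above, each 8-wide row is carried as a left/right pair of int32x4_t vectors holding columns 0-3 and 4-7; the pass functions repack the pairs into int32x4x2_t only so that transpose_s32_8x8 can work on whole rows, and the final stores in vpx_highbd_fdct8x8_neon write left[r] then right[r] back to a row-major buffer. A plain-array sketch of that layout (illustrative, not libvpx API):

#include <stdint.h>

// Sketch: with the left/right convention used above, coefficient
// (row r, column c) of the 8x8 block lives in
//   left[r],  lane c        for c < 4
//   right[r], lane c - 4    for c >= 4
// which matches the final vst1q_s32 stores to final_output + 8 * r.
static int32_t get_coeff(const int32_t left[8][4], const int32_t right[8][4],
                         int r, int c) {
  return (c < 4) ? left[r][c] : right[r][c - 4];
}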
diff --git a/libvpx/vpx_dsp/arm/fdct_neon.h b/libvpx/vpx_dsp/arm/fdct_neon.h
index 28d7d86bf..193594e3d 100644
--- a/libvpx/vpx_dsp/arm/fdct_neon.h
+++ b/libvpx/vpx_dsp/arm/fdct_neon.h
@@ -13,201 +13,411 @@
#include <arm_neon.h>
-static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) {
- const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
- const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
-
- // in_0 +/- in_3, in_1 +/- in_2
- const int16x8_t s_01 = vaddq_s16(input_01, input_32);
- const int16x8_t s_32 = vsubq_s16(input_01, input_32);
-
- // step_0 +/- step_1, step_2 +/- step_3
- const int16x4_t s_0 = vget_low_s16(s_01);
- const int16x4_t s_1 = vget_high_s16(s_01);
- const int16x4_t s_2 = vget_high_s16(s_32);
- const int16x4_t s_3 = vget_low_s16(s_32);
-
- // (s_0 +/- s_1) * cospi_16_64
- // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
- const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
- const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
- const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64);
- const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64);
-
- // fdct_round_shift
- int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
- int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS);
-
- // s_3 * cospi_8_64 + s_2 * cospi_24_64
- // s_3 * cospi_24_64 - s_2 * cospi_8_64
- const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64);
- const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64);
-
- const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64);
- const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64);
-
- // fdct_round_shift
- int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
- int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS);
-
- transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
-
- in[0] = out_0;
- in[1] = out_1;
- in[2] = out_2;
- in[3] = out_3;
-}
-
-static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in,
- int16x8_t *out) {
- const int16x8_t v_s0 = vaddq_s16(in[0], in[7]);
- const int16x8_t v_s1 = vaddq_s16(in[1], in[6]);
- const int16x8_t v_s2 = vaddq_s16(in[2], in[5]);
- const int16x8_t v_s3 = vaddq_s16(in[3], in[4]);
- const int16x8_t v_s4 = vsubq_s16(in[3], in[4]);
- const int16x8_t v_s5 = vsubq_s16(in[2], in[5]);
- const int16x8_t v_s6 = vsubq_s16(in[1], in[6]);
- const int16x8_t v_s7 = vsubq_s16(in[0], in[7]);
- // fdct4(step, step);
- int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
- int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
- int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
- int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
- // fdct4(step, step);
- int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
- int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
- int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
- int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
- int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64);
- int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64);
- int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64);
- int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64);
- v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64);
- v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64);
- v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64);
- v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64);
- v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
- v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
- v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
- v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
- {
- const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
- const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
- const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
- const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
- const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
- const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
- const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
- const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
- out[0] = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43
- out[2] = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63
- out[4] = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47
- out[6] = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67
- }
- // Stage 2
- v_x0 = vsubq_s16(v_s6, v_s5);
- v_x1 = vaddq_s16(v_s6, v_s5);
- v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64);
- v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64);
- v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64);
- v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64);
- {
- const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
- const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
- const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
- const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
- const int16x8_t ab = vcombine_s16(a, b);
- const int16x8_t cd = vcombine_s16(c, d);
- // Stage 3
- v_x0 = vaddq_s16(v_s4, ab);
- v_x1 = vsubq_s16(v_s4, ab);
- v_x2 = vsubq_s16(v_s7, cd);
- v_x3 = vaddq_s16(v_s7, cd);
- }
- // Stage 4
- v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64);
- v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64);
- v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64);
- v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64);
- v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64);
- v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64);
- v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64);
- v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64);
- v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64);
- v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64);
- v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64);
- v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64);
- v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64);
- v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64);
- v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64);
- v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64);
- {
- const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
- const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
- const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
- const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
- const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
- const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
- const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
- const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
- out[1] = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53
- out[3] = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73
- out[5] = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57
- out[7] = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77
- }
-}
-
-static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) {
- int16x8_t out[8];
- vpx_fdct8x8_pass1_notranspose_neon(in, out);
- // transpose 8x8
- // Can't use transpose_s16_8x8() because the values are arranged in two 4x8
- // columns.
- {
- // 00 01 02 03 40 41 42 43
- // 10 11 12 13 50 51 52 53
- // 20 21 22 23 60 61 62 63
- // 30 31 32 33 70 71 72 73
- // 04 05 06 07 44 45 46 47
- // 14 15 16 17 54 55 56 57
- // 24 25 26 27 64 65 66 67
- // 34 35 36 37 74 75 76 77
- const int32x4x2_t r02_s32 =
- vtrnq_s32(vreinterpretq_s32_s16(out[0]), vreinterpretq_s32_s16(out[2]));
- const int32x4x2_t r13_s32 =
- vtrnq_s32(vreinterpretq_s32_s16(out[1]), vreinterpretq_s32_s16(out[3]));
- const int32x4x2_t r46_s32 =
- vtrnq_s32(vreinterpretq_s32_s16(out[4]), vreinterpretq_s32_s16(out[6]));
- const int32x4x2_t r57_s32 =
- vtrnq_s32(vreinterpretq_s32_s16(out[5]), vreinterpretq_s32_s16(out[7]));
- const int16x8x2_t r01_s16 =
- vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
- vreinterpretq_s16_s32(r13_s32.val[0]));
- const int16x8x2_t r23_s16 =
- vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
- vreinterpretq_s16_s32(r13_s32.val[1]));
- const int16x8x2_t r45_s16 =
- vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
- vreinterpretq_s16_s32(r57_s32.val[0]));
- const int16x8x2_t r67_s16 =
- vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
- vreinterpretq_s16_s32(r57_s32.val[1]));
- in[0] = r01_s16.val[0];
- in[1] = r01_s16.val[1];
- in[2] = r23_s16.val[0];
- in[3] = r23_s16.val[1];
- in[4] = r45_s16.val[0];
- in[5] = r45_s16.val[1];
- in[6] = r67_s16.val[0];
- in[7] = r67_s16.val[1];
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
- }
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs the fast vqrdmulh_s16 operation on a half vector; can
+// be slightly less accurate, which is adequate for pass 1.
+static INLINE void butterfly_one_coeff_s16_fast_half(const int16x4_t a,
+ const int16x4_t b,
+ const tran_coef_t constant,
+ int16x4_t *add,
+ int16x4_t *sub) {
+ int16x4_t c = vdup_n_s16(2 * constant);
+ *add = vqrdmulh_s16(vadd_s16(a, b), c);
+ *sub = vqrdmulh_s16(vsub_s16(a, b), c);
}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs the fast vqrdmulh_s16 operation on the full vector;
+// can be slightly less accurate, which is adequate for pass 1.
+static INLINE void butterfly_one_coeff_s16_fast(const int16x8_t a,
+ const int16x8_t b,
+ const tran_coef_t constant,
+ int16x8_t *add,
+ int16x8_t *sub) {
+ int16x8_t c = vdupq_n_s16(2 * constant);
+ *add = vqrdmulhq_s16(vaddq_s16(a, b), c);
+ *sub = vqrdmulhq_s16(vsubq_s16(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// More accurate variant that does the math in 32 bits with vqrdmulhq_s32 on
+// the full vector; takes 16-bit input values and returns full 32-bit values,
+// high/low.
+static INLINE void butterfly_one_coeff_s16_s32_fast(
+ const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+ int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
+ int32x4_t *sub_hi) {
+ int32x4_t c = vdupq_n_s32(constant << 17);
+ const int16x4_t a_lo = vget_low_s16(a);
+ const int16x4_t a_hi = vget_high_s16(a);
+ const int16x4_t b_lo = vget_low_s16(b);
+ const int16x4_t b_hi = vget_high_s16(b);
+ *add_lo = vqrdmulhq_s32(vaddl_s16(a_lo, b_lo), c);
+ *add_hi = vqrdmulhq_s32(vaddl_s16(a_hi, b_hi), c);
+ *sub_lo = vqrdmulhq_s32(vsubl_s16(a_lo, b_lo), c);
+ *sub_hi = vqrdmulhq_s32(vsubl_s16(a_hi, b_hi), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// More accurate variant that does the math in 32 bits with vqrdmulhq_s32 on
+// the full vector; takes 16-bit input values and returns the results narrowed
+// back down to 16 bits.
+static INLINE void butterfly_one_coeff_s16_s32_fast_narrow(
+ const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+ int16x8_t *add, int16x8_t *sub) {
+ int32x4_t add_lo, add_hi, sub_lo, sub_hi;
+ butterfly_one_coeff_s16_s32_fast(a, b, constant, &add_lo, &add_hi, &sub_lo,
+ &sub_hi);
+ *add = vcombine_s16(vmovn_s32(add_lo), vmovn_s32(add_hi));
+ *sub = vcombine_s16(vmovn_s32(sub_lo), vmovn_s32(sub_hi));
+}
+
+// fdct_round_shift((a +/- b) * c)
+// More accurate variant that does the math in 32 bits with vqrdmulhq_s32 on a
+// half vector; takes 16-bit input values and returns full 32-bit values.
+static INLINE void butterfly_one_coeff_s16_s32_fast_half(
+ const int16x4_t a, const int16x4_t b, const tran_coef_t constant,
+ int32x4_t *add, int32x4_t *sub) {
+ int32x4_t c = vdupq_n_s32(constant << 17);
+ *add = vqrdmulhq_s32(vaddl_s16(a, b), c);
+ *sub = vqrdmulhq_s32(vsubl_s16(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// More accurate variant that does the math in 32 bits with vqrdmulhq_s32 on a
+// half vector; takes 16-bit input values and returns the results narrowed
+// back down to 16 bits.
+static INLINE void butterfly_one_coeff_s16_s32_fast_narrow_half(
+ const int16x4_t a, const int16x4_t b, const tran_coef_t constant,
+ int16x4_t *add, int16x4_t *sub) {
+ int32x4_t add32, sub32;
+ butterfly_one_coeff_s16_s32_fast_half(a, b, constant, &add32, &sub32);
+ *add = vmovn_s32(add32);
+ *sub = vmovn_s32(sub32);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Original variant using the normal implementation on the full vector; fully
+// accurate, does 32-bit processing, takes 16-bit values and returns full
+// 32-bit values, high/low.
+static INLINE void butterfly_one_coeff_s16_s32(
+ const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+ int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
+ int32x4_t *sub_hi) {
+ const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
+ const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
+ const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
+ const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
+ const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
+ const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
+ *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
+ *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
+ *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
+ *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Original variant that performs the normal implementation on a full vector.
+// Fully accurate: does 32-bit processing, takes 16-bit input values and
+// returns values narrowed back down to 16 bits.
+static INLINE void butterfly_one_coeff_s16_s32_narrow(
+ const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+ int16x8_t *add, int16x8_t *sub) {
+ int32x4_t add32_lo, add32_hi, sub32_lo, sub32_hi;
+ butterfly_one_coeff_s16_s32(a, b, constant, &add32_lo, &add32_hi, &sub32_lo,
+ &sub32_hi);
+ *add = vcombine_s16(vmovn_s32(add32_lo), vmovn_s32(add32_hi));
+ *sub = vcombine_s16(vmovn_s32(sub32_lo), vmovn_s32(sub32_hi));
+}
+
+// (a +/- b) * c, without the fdct_round_shift
+// Variant that performs the normal (non-rounding) implementation on a full
+// vector: does 32-bit processing, takes and returns 32-bit values as
+// high/low halves.
+static INLINE void butterfly_one_coeff_s32_noround(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo,
+ int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ const int32x4_t a1 = vmulq_n_s32(a_lo, constant);
+ const int32x4_t a2 = vmulq_n_s32(a_hi, constant);
+ const int32x4_t a3 = vmulq_n_s32(a_lo, constant);
+ const int32x4_t a4 = vmulq_n_s32(a_hi, constant);
+ *add_lo = vmlaq_n_s32(a1, b_lo, constant);
+ *add_hi = vmlaq_n_s32(a2, b_hi, constant);
+ *sub_lo = vmlsq_n_s32(a3, b_lo, constant);
+ *sub_hi = vmlsq_n_s32(a4, b_hi, constant);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that uses the fast vqrdmulhq_s32 operation on a half vector.
+// More accurate: does 32-bit processing, takes and returns 32-bit values.
+static INLINE void butterfly_one_coeff_s32_fast_half(const int32x4_t a,
+ const int32x4_t b,
+ const tran_coef_t constant,
+ int32x4_t *add,
+ int32x4_t *sub) {
+ const int32x4_t c = vdupq_n_s32(constant << 17);
+ *add = vqrdmulhq_s32(vaddq_s32(a, b), c);
+ *sub = vqrdmulhq_s32(vsubq_s32(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that uses the fast vqrdmulhq_s32 operation on a full vector.
+// More accurate: does 32-bit processing, takes and returns 32-bit values as
+// high/low halves.
+static INLINE void butterfly_one_coeff_s32_fast(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo,
+ int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ const int32x4_t c = vdupq_n_s32(constant << 17);
+ *add_lo = vqrdmulhq_s32(vaddq_s32(a_lo, b_lo), c);
+ *add_hi = vqrdmulhq_s32(vaddq_s32(a_hi, b_hi), c);
+ *sub_lo = vqrdmulhq_s32(vsubq_s32(a_lo, b_lo), c);
+ *sub_hi = vqrdmulhq_s32(vsubq_s32(a_hi, b_hi), c);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs the normal implementation on a half vector.
+// More accurate: does 64-bit processing, takes 32-bit values and returns
+// results narrowed back down to 32 bits.
+static INLINE void butterfly_two_coeff_s32_s64_narrow_half(
+ const int32x4_t a, const int32x4_t b, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add, int32x4_t *sub) {
+ const int32x2_t a_lo = vget_low_s32(a);
+ const int32x2_t a_hi = vget_high_s32(a);
+ const int32x2_t b_lo = vget_low_s32(b);
+ const int32x2_t b_hi = vget_high_s32(b);
+
+ const int64x2_t axc0_64_lo = vmull_n_s32(a_lo, constant1);
+ const int64x2_t axc0_64_hi = vmull_n_s32(a_hi, constant1);
+ const int64x2_t axc1_64_lo = vmull_n_s32(a_lo, constant2);
+ const int64x2_t axc1_64_hi = vmull_n_s32(a_hi, constant2);
+
+ const int64x2_t sum_lo = vmlal_n_s32(axc0_64_lo, b_lo, constant2);
+ const int64x2_t sum_hi = vmlal_n_s32(axc0_64_hi, b_hi, constant2);
+ const int64x2_t diff_lo = vmlsl_n_s32(axc1_64_lo, b_lo, constant1);
+ const int64x2_t diff_hi = vmlsl_n_s32(axc1_64_hi, b_hi, constant1);
+
+ *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS),
+ vrshrn_n_s64(sum_hi, DCT_CONST_BITS));
+ *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS),
+ vrshrn_n_s64(diff_hi, DCT_CONST_BITS));
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs the normal implementation on a full vector.
+// More accurate: does 64-bit processing, takes 32-bit values and returns
+// results narrowed back down to 32 bits.
+static INLINE void butterfly_two_coeff_s32_s64_narrow(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+ int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ // ac1/ac2 hold the following values:
+ // ac1: vget_low_s32(a_lo) * c1, vget_high_s32(a_lo) * c1,
+ // vget_low_s32(a_hi) * c1, vget_high_s32(a_hi) * c1
+ // ac2: vget_low_s32(a_lo) * c2, vget_high_s32(a_lo) * c2,
+ // vget_low_s32(a_hi) * c2, vget_high_s32(a_hi) * c2
+ int64x2_t ac1[4];
+ int64x2_t ac2[4];
+ int64x2_t sum[4];
+ int64x2_t diff[4];
+
+ ac1[0] = vmull_n_s32(vget_low_s32(a_lo), constant1);
+ ac1[1] = vmull_n_s32(vget_high_s32(a_lo), constant1);
+ ac1[2] = vmull_n_s32(vget_low_s32(a_hi), constant1);
+ ac1[3] = vmull_n_s32(vget_high_s32(a_hi), constant1);
+ ac2[0] = vmull_n_s32(vget_low_s32(a_lo), constant2);
+ ac2[1] = vmull_n_s32(vget_high_s32(a_lo), constant2);
+ ac2[2] = vmull_n_s32(vget_low_s32(a_hi), constant2);
+ ac2[3] = vmull_n_s32(vget_high_s32(a_hi), constant2);
+
+ sum[0] = vmlal_n_s32(ac1[0], vget_low_s32(b_lo), constant2);
+ sum[1] = vmlal_n_s32(ac1[1], vget_high_s32(b_lo), constant2);
+ sum[2] = vmlal_n_s32(ac1[2], vget_low_s32(b_hi), constant2);
+ sum[3] = vmlal_n_s32(ac1[3], vget_high_s32(b_hi), constant2);
+ *add_lo = vcombine_s32(vrshrn_n_s64(sum[0], DCT_CONST_BITS),
+ vrshrn_n_s64(sum[1], DCT_CONST_BITS));
+ *add_hi = vcombine_s32(vrshrn_n_s64(sum[2], DCT_CONST_BITS),
+ vrshrn_n_s64(sum[3], DCT_CONST_BITS));
+
+ diff[0] = vmlsl_n_s32(ac2[0], vget_low_s32(b_lo), constant1);
+ diff[1] = vmlsl_n_s32(ac2[1], vget_high_s32(b_lo), constant1);
+ diff[2] = vmlsl_n_s32(ac2[2], vget_low_s32(b_hi), constant1);
+ diff[3] = vmlsl_n_s32(ac2[3], vget_high_s32(b_hi), constant1);
+ *sub_lo = vcombine_s32(vrshrn_n_s64(diff[0], DCT_CONST_BITS),
+ vrshrn_n_s64(diff[1], DCT_CONST_BITS));
+ *sub_hi = vcombine_s32(vrshrn_n_s64(diff[2], DCT_CONST_BITS),
+ vrshrn_n_s64(diff[3], DCT_CONST_BITS));
+}
+
+// (a * c1 +/- b * c2), without the fdct_round_shift
+// Variant that performs the normal (non-rounding) implementation on a full
+// vector: does 32-bit processing, takes 16-bit values and returns full
+// 32-bit values as high/low halves.
+static INLINE void butterfly_two_coeff_s16_s32_noround(
+ const int16x4_t a_lo, const int16x4_t a_hi, const int16x4_t b_lo,
+ const int16x4_t b_hi, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+ int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ const int32x4_t a1 = vmull_n_s16(a_lo, constant1);
+ const int32x4_t a2 = vmull_n_s16(a_hi, constant1);
+ const int32x4_t a3 = vmull_n_s16(a_lo, constant2);
+ const int32x4_t a4 = vmull_n_s16(a_hi, constant2);
+ *add_lo = vmlal_n_s16(a1, b_lo, constant2);
+ *add_hi = vmlal_n_s16(a2, b_hi, constant2);
+ *sub_lo = vmlsl_n_s16(a3, b_lo, constant1);
+ *sub_hi = vmlsl_n_s16(a4, b_hi, constant1);
+}
+
+// (a * c1 +/- b * c2), without the fdct_round_shift
+// Variant that performs the normal (non-rounding) implementation on a full
+// vector: does 32-bit processing, takes and returns 32-bit values as
+// high/low halves.
+static INLINE void butterfly_two_coeff_s32_noround(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+ int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ const int32x4_t a1 = vmulq_n_s32(a_lo, constant1);
+ const int32x4_t a2 = vmulq_n_s32(a_hi, constant1);
+ const int32x4_t a3 = vmulq_n_s32(a_lo, constant2);
+ const int32x4_t a4 = vmulq_n_s32(a_hi, constant2);
+ *add_lo = vmlaq_n_s32(a1, b_lo, constant2);
+ *add_hi = vmlaq_n_s32(a2, b_hi, constant2);
+ *sub_lo = vmlsq_n_s32(a3, b_lo, constant1);
+ *sub_hi = vmlsq_n_s32(a4, b_hi, constant1);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs the normal implementation on a half vector.
+// More accurate: does 32-bit processing, takes 16-bit values and returns
+// results narrowed back down to 16 bits.
+static INLINE void butterfly_two_coeff_half(const int16x4_t a,
+ const int16x4_t b,
+ const tran_coef_t constant1,
+ const tran_coef_t constant2,
+ int16x4_t *add, int16x4_t *sub) {
+ const int32x4_t a1 = vmull_n_s16(a, constant1);
+ const int32x4_t a2 = vmull_n_s16(a, constant2);
+ const int32x4_t sum = vmlal_n_s16(a1, b, constant2);
+ const int32x4_t diff = vmlsl_n_s16(a2, b, constant1);
+ *add = vqrshrn_n_s32(sum, DCT_CONST_BITS);
+ *sub = vqrshrn_n_s32(diff, DCT_CONST_BITS);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original variant that performs the normal implementation on a full vector.
+// More accurate: does 32-bit processing, takes 16-bit values and returns
+// results narrowed back down to 16 bits.
+static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
+ const tran_coef_t constant1,
+ const tran_coef_t constant2,
+ int16x8_t *add, int16x8_t *sub) {
+ const int32x4_t a1 = vmull_n_s16(vget_low_s16(a), constant1);
+ const int32x4_t a2 = vmull_n_s16(vget_high_s16(a), constant1);
+ const int32x4_t a3 = vmull_n_s16(vget_low_s16(a), constant2);
+ const int32x4_t a4 = vmull_n_s16(vget_high_s16(a), constant2);
+ const int32x4_t sum0 = vmlal_n_s16(a1, vget_low_s16(b), constant2);
+ const int32x4_t sum1 = vmlal_n_s16(a2, vget_high_s16(b), constant2);
+ const int32x4_t diff0 = vmlsl_n_s16(a3, vget_low_s16(b), constant1);
+ const int32x4_t diff1 = vmlsl_n_s16(a4, vget_high_s16(b), constant1);
+ const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
+ const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
+ const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
+ const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
+ *add = vcombine_s16(rounded0, rounded1);
+ *sub = vcombine_s16(rounded2, rounded3);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original variant that performs the normal implementation on a full vector.
+// More accurate: does 32-bit processing, takes and returns 32-bit values as
+// high/low halves (rounded, but not narrowed).
+static INLINE void butterfly_two_coeff_s32(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+ int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ const int32x4_t a1 = vmulq_n_s32(a_lo, constant1);
+ const int32x4_t a2 = vmulq_n_s32(a_hi, constant1);
+ const int32x4_t a3 = vmulq_n_s32(a_lo, constant2);
+ const int32x4_t a4 = vmulq_n_s32(a_hi, constant2);
+ const int32x4_t sum0 = vmlaq_n_s32(a1, b_lo, constant2);
+ const int32x4_t sum1 = vmlaq_n_s32(a2, b_hi, constant2);
+ const int32x4_t diff0 = vmlsq_n_s32(a3, b_lo, constant1);
+ const int32x4_t diff1 = vmlsq_n_s32(a4, b_hi, constant1);
+ *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
+ *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
+ *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
+ *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+}
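Taken together, a scalar sketch of what the two-coefficient butterflies above compute (illustrative only, derived from the intrinsics):

/*
 *   add = fdct_round_shift(a * c1 + b * c2);
 *   sub = fdct_round_shift(a * c2 - b * c1);
 *
 * with fdct_round_shift(x) == (x + (1 << 13)) >> 14 for DCT_CONST_BITS == 14.
 * The _noround variants return the raw a * c1 + b * c2 and a * c2 - b * c1
 * products without the final shift.
 */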
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding.
+static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) {
+ const int16x8_t one = vdupq_n_s16(1);
+ const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
+ const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
+ const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
+ return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2);
+}
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding,
+// returning results narrowed to 16 bits.
+static INLINE int16x8_t add_round_shift_s32_narrow(const int32x4_t a_lo,
+ const int32x4_t a_hi) {
+ const int32x4_t one = vdupq_n_s32(1);
+ const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo);
+ const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31);
+ const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32);
+ const int16x4_t b_lo =
+ vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2);
+ const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi);
+ const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31);
+ const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32);
+ const int16x4_t b_hi =
+ vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2);
+ return vcombine_s16(b_lo, b_hi);
+}
+
+// Add 1 if negative, and shift by 1.
+// In practice, add the sign bit, then shift without rounding.
+static INLINE int32x4_t add_round_shift_half_s32(const int32x4_t a) {
+ const uint32x4_t a_u32 = vreinterpretq_u32_s32(a);
+ const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31);
+ const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32);
+ return vshrq_n_s32(vaddq_s32(a, a_sign_s32), 1);
+}
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding.
+static INLINE int32x4_t add_round_shift_s32(const int32x4_t a) {
+ const int32x4_t one = vdupq_n_s32(1);
+ const uint32x4_t a_u32 = vreinterpretq_u32_s32(a);
+ const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31);
+ const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32);
+ return vshrq_n_s32(vaddq_s32(vaddq_s32(a, a_sign_s32), one), 2);
+}
+
+// Add 2 if positive, 1 if negative, and shift by 2.
+// In practice, subtract the sign bit, then shift with rounding.
+static INLINE int16x8_t sub_round_shift_s16(const int16x8_t a) {
+ const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
+ const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
+ const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
+ return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2);
+}
+
+// Add 2 if positive, 1 if negative, and shift by 2.
+// In practice, subtract the sign bit, then shift with rounding.
+static INLINE int32x4_t sub_round_shift_s32(const int32x4_t a) {
+ const uint32x4_t a_u32 = vreinterpretq_u32_s32(a);
+ const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31);
+ const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32);
+ return vrshrq_n_s32(vsubq_s32(a, a_sign_s32), 2);
+}
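Scalar equivalents of the rounding helpers above, derived directly from the intrinsics (sign is 1 for negative inputs, 0 otherwise):

/*
 *   add_round_shift(a)      == (a + 1 + sign) >> 2
 *   sub_round_shift(a)      == (a + 2 - sign) >> 2   // vrshr supplies the 2
 *   add_round_shift_half(a) == (a + sign) >> 1
 */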
+
#endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_
diff --git a/libvpx/vpx_dsp/arm/fdct_partial_neon.c b/libvpx/vpx_dsp/arm/fdct_partial_neon.c
index 0a1cdca41..718dba0d9 100644
--- a/libvpx/vpx_dsp/arm/fdct_partial_neon.c
+++ b/libvpx/vpx_dsp/arm/fdct_partial_neon.c
@@ -101,3 +101,68 @@ void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output,
output[0] = (tran_low_t)(sum >> 3);
output[1] = 0;
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct16x16_1_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int32x4_t partial_sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+ vdupq_n_s32(0) };
+ int32_t sum;
+
+ int r = 0;
+ do {
+ const int16x8_t a = vld1q_s16(input);
+ const int16x8_t b = vld1q_s16(input + 8);
+ input += stride;
+ partial_sum[0] = vaddw_s16(partial_sum[0], vget_low_s16(a));
+ partial_sum[1] = vaddw_s16(partial_sum[1], vget_high_s16(a));
+ partial_sum[2] = vaddw_s16(partial_sum[2], vget_low_s16(b));
+ partial_sum[3] = vaddw_s16(partial_sum[3], vget_high_s16(b));
+ r++;
+ } while (r < 16);
+
+ partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[1]);
+ partial_sum[2] = vaddq_s32(partial_sum[2], partial_sum[3]);
+ partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[2]);
+ sum = horizontal_add_int32x4(partial_sum[0]);
+
+ output[0] = (tran_low_t)(sum >> 1);
+ output[1] = 0;
+}
+
+void vpx_highbd_fdct32x32_1_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int32x4_t partial_sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+ vdupq_n_s32(0) };
+
+ int32_t sum;
+
+ int r = 0;
+ do {
+ const int16x8_t a0 = vld1q_s16(input);
+ const int16x8_t a1 = vld1q_s16(input + 8);
+ const int16x8_t a2 = vld1q_s16(input + 16);
+ const int16x8_t a3 = vld1q_s16(input + 24);
+ input += stride;
+ partial_sum[0] = vaddw_s16(partial_sum[0], vget_low_s16(a0));
+ partial_sum[0] = vaddw_s16(partial_sum[0], vget_high_s16(a0));
+ partial_sum[1] = vaddw_s16(partial_sum[1], vget_low_s16(a1));
+ partial_sum[1] = vaddw_s16(partial_sum[1], vget_high_s16(a1));
+ partial_sum[2] = vaddw_s16(partial_sum[2], vget_low_s16(a2));
+ partial_sum[2] = vaddw_s16(partial_sum[2], vget_high_s16(a2));
+ partial_sum[3] = vaddw_s16(partial_sum[3], vget_low_s16(a3));
+ partial_sum[3] = vaddw_s16(partial_sum[3], vget_high_s16(a3));
+ r++;
+ } while (r < 32);
+
+ partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[1]);
+ partial_sum[2] = vaddq_s32(partial_sum[2], partial_sum[3]);
+ partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[2]);
+ sum = horizontal_add_int32x4(partial_sum[0]);
+
+ output[0] = (tran_low_t)(sum >> 3);
+ output[1] = 0;
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
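The two high-bitdepth helpers above compute only the DC coefficient of the forward transform. A scalar sketch of the equivalent computation, assuming the same headers as the file above (highbd_fdct_dc_ref is a hypothetical name):

static void highbd_fdct_dc_ref(const int16_t *input, int stride, int size,
                               int shift, tran_low_t *output) {
  int r, c;
  int32_t sum = 0;
  for (r = 0; r < size; ++r) {
    for (c = 0; c < size; ++c) sum += input[r * stride + c];
  }
  // shift is 1 for the 16x16 variant and 3 for the 32x32 variant.
  output[0] = (tran_low_t)(sum >> shift);
  output[1] = 0;
}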
diff --git a/libvpx/vpx_dsp/arm/fwd_txfm_neon.c b/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
deleted file mode 100644
index d9161c6d3..000000000
--- a/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_config.h"
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/txfm_common.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_dsp/arm/idct_neon.h"
-#include "vpx_dsp/arm/fdct_neon.h"
-#include "vpx_dsp/arm/mem_neon.h"
-
-void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
- int stride) {
- int i;
- // stage 1
- int16x8_t in[8];
- in[0] = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
- in[1] = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
- in[2] = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
- in[3] = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
- in[4] = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
- in[5] = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
- in[6] = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
- in[7] = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
- for (i = 0; i < 2; ++i) {
- vpx_fdct8x8_pass1_neon(in);
- } // for
- {
- // from vpx_dct_sse2.c
- // Post-condition (division by two)
- // division of two 16 bits signed numbers using shifts
- // n / 2 = (n - (n >> 15)) >> 1
- const int16x8_t sign_in0 = vshrq_n_s16(in[0], 15);
- const int16x8_t sign_in1 = vshrq_n_s16(in[1], 15);
- const int16x8_t sign_in2 = vshrq_n_s16(in[2], 15);
- const int16x8_t sign_in3 = vshrq_n_s16(in[3], 15);
- const int16x8_t sign_in4 = vshrq_n_s16(in[4], 15);
- const int16x8_t sign_in5 = vshrq_n_s16(in[5], 15);
- const int16x8_t sign_in6 = vshrq_n_s16(in[6], 15);
- const int16x8_t sign_in7 = vshrq_n_s16(in[7], 15);
- in[0] = vhsubq_s16(in[0], sign_in0);
- in[1] = vhsubq_s16(in[1], sign_in1);
- in[2] = vhsubq_s16(in[2], sign_in2);
- in[3] = vhsubq_s16(in[3], sign_in3);
- in[4] = vhsubq_s16(in[4], sign_in4);
- in[5] = vhsubq_s16(in[5], sign_in5);
- in[6] = vhsubq_s16(in[6], sign_in6);
- in[7] = vhsubq_s16(in[7], sign_in7);
- // store results
- store_s16q_to_tran_low(final_output + 0 * 8, in[0]);
- store_s16q_to_tran_low(final_output + 1 * 8, in[1]);
- store_s16q_to_tran_low(final_output + 2 * 8, in[2]);
- store_s16q_to_tran_low(final_output + 3 * 8, in[3]);
- store_s16q_to_tran_low(final_output + 4 * 8, in[4]);
- store_s16q_to_tran_low(final_output + 5 * 8, in[5]);
- store_s16q_to_tran_low(final_output + 6 * 8, in[6]);
- store_s16q_to_tran_low(final_output + 7 * 8, in[7]);
- }
-}
diff --git a/libvpx/vpx_dsp/arm/hadamard_neon.c b/libvpx/vpx_dsp/arm/hadamard_neon.c
index 523a63c6f..f6b6d7e3c 100644
--- a/libvpx/vpx_dsp/arm/hadamard_neon.c
+++ b/libvpx/vpx_dsp/arm/hadamard_neon.c
@@ -114,3 +114,45 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
coeff += 8;
}
}
+
+void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int i;
+
+ /* Rearrange 32x32 to 16x64 and remove stride.
+ * Top left first. */
+ vpx_hadamard_16x16_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
+ /* Top right. */
+ vpx_hadamard_16x16_neon(src_diff + 16 + 0 * src_stride, src_stride,
+ coeff + 256);
+ /* Bottom left. */
+ vpx_hadamard_16x16_neon(src_diff + 0 + 16 * src_stride, src_stride,
+ coeff + 512);
+ /* Bottom right. */
+ vpx_hadamard_16x16_neon(src_diff + 16 + 16 * src_stride, src_stride,
+ coeff + 768);
+
+ for (i = 0; i < 256; i += 8) {
+ const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0);
+ const int16x8_t a1 = load_tran_low_to_s16q(coeff + 256);
+ const int16x8_t a2 = load_tran_low_to_s16q(coeff + 512);
+ const int16x8_t a3 = load_tran_low_to_s16q(coeff + 768);
+
+ const int16x8_t b0 = vhaddq_s16(a0, a1);
+ const int16x8_t b1 = vhsubq_s16(a0, a1);
+ const int16x8_t b2 = vhaddq_s16(a2, a3);
+ const int16x8_t b3 = vhsubq_s16(a2, a3);
+
+ const int16x8_t c0 = vhaddq_s16(b0, b2);
+ const int16x8_t c1 = vhaddq_s16(b1, b3);
+ const int16x8_t c2 = vhsubq_s16(b0, b2);
+ const int16x8_t c3 = vhsubq_s16(b1, b3);
+
+ store_s16q_to_tran_low(coeff + 0, c0);
+ store_s16q_to_tran_low(coeff + 256, c1);
+ store_s16q_to_tran_low(coeff + 512, c2);
+ store_s16q_to_tran_low(coeff + 768, c3);
+
+ coeff += 8;
+ }
+}
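The 32x32 Hadamard reuses four 16x16 transforms and then applies one more radix-2 stage; because vhaddq_s16/vhsubq_s16 are halving operations, each combined coefficient is also scaled down to stay in 16-bit range. Per coefficient, the loop above computes (scalar sketch):

/*
 *   b0 = (a0 + a1) >> 1;   b1 = (a0 - a1) >> 1;
 *   b2 = (a2 + a3) >> 1;   b3 = (a2 - a3) >> 1;
 *
 *   c0 = (b0 + b2) >> 1;   c1 = (b1 + b3) >> 1;
 *   c2 = (b0 - b2) >> 1;   c3 = (b1 - b3) >> 1;
 */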
diff --git a/libvpx/vpx_dsp/arm/highbd_quantize_neon.c b/libvpx/vpx_dsp/arm/highbd_quantize_neon.c
new file mode 100644
index 000000000..b9f72a94c
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/highbd_quantize_neon.c
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+
+static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store(
+ const int32x4_t dqcoeff_0, const int32x4_t dqcoeff_1,
+ tran_low_t *dqcoeff_ptr) {
+ vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+ vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
+}
+
+static VPX_FORCE_INLINE void highbd_quantize_8_neon(
+ const int32x4_t coeff_0, const int32x4_t coeff_1, const int32x4_t zbin,
+ const int32x4_t round, const int32x4_t quant, const int32x4_t quant_shift,
+ int32x4_t *qcoeff_0, int32x4_t *qcoeff_1) {
+  // Take the sign and compute the absolute values of the coefficients
+  // (2 vectors of 4 x 32-bit ints each).
+ const int32x4_t coeff_0_sign = vshrq_n_s32(coeff_0, 31);
+ const int32x4_t coeff_1_sign = vshrq_n_s32(coeff_1, 31);
+ const int32x4_t coeff_0_abs = vabsq_s32(coeff_0);
+ const int32x4_t coeff_1_abs = vabsq_s32(coeff_1);
+
+  // Calculate 2 masks of elements outside the zero bin (i.e. kept, not zeroed)
+ const int32x4_t zbin_mask_0 =
+ vreinterpretq_s32_u32(vcgeq_s32(coeff_0_abs, zbin));
+ const int32x4_t zbin_mask_1 = vreinterpretq_s32_u32(
+ vcgeq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(zbin), 1)));
+
+ // Get the rounded values
+ const int32x4_t rounded_0 = vaddq_s32(coeff_0_abs, round);
+ const int32x4_t rounded_1 =
+ vaddq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(round), 1));
+
+  // (rounded * (quant << 15) * 2) >> 32 == (rounded * quant) >> 16
+ int32x4_t qcoeff_tmp_0 = vqdmulhq_s32(rounded_0, quant);
+ int32x4_t qcoeff_tmp_1 =
+ vqdmulhq_s32(rounded_1, vdupq_lane_s32(vget_low_s32(quant), 1));
+
+ // Add rounded values
+ qcoeff_tmp_0 = vaddq_s32(qcoeff_tmp_0, rounded_0);
+ qcoeff_tmp_1 = vaddq_s32(qcoeff_tmp_1, rounded_1);
+
+  // (x * (quant_shift << 15) * 2) >> 32 == (x * quant_shift) >> 16
+ qcoeff_tmp_0 = vqdmulhq_s32(qcoeff_tmp_0, quant_shift);
+ qcoeff_tmp_1 =
+ vqdmulhq_s32(qcoeff_tmp_1, vdupq_lane_s32(vget_low_s32(quant_shift), 1));
+
+ // Restore the sign bit.
+ qcoeff_tmp_0 = veorq_s32(qcoeff_tmp_0, coeff_0_sign);
+ qcoeff_tmp_1 = veorq_s32(qcoeff_tmp_1, coeff_1_sign);
+ qcoeff_tmp_0 = vsubq_s32(qcoeff_tmp_0, coeff_0_sign);
+ qcoeff_tmp_1 = vsubq_s32(qcoeff_tmp_1, coeff_1_sign);
+
+ // Only keep the relevant coeffs
+ *qcoeff_0 = vandq_s32(qcoeff_tmp_0, zbin_mask_0);
+ *qcoeff_1 = vandq_s32(qcoeff_tmp_1, zbin_mask_1);
+}
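A scalar sketch of the per-coefficient arithmetic implemented by this helper, assuming the usual quantize_b formulation (illustrative pseudo-C, not the exact reference implementation):

/*
 *   if (abs(coeff) >= zbin) {
 *     tmp    = abs(coeff) + round;
 *     tmp    = ((tmp * quant) >> 16) + tmp;
 *     qcoeff = sign(coeff) * ((tmp * quant_shift) >> 16);
 *   } else {
 *     qcoeff = 0;
 *   }
 *
 * quant arrives pre-shifted left by 15 (and quant_shift by 15 or 16,
 * depending on the caller), so vqdmulhq_s32(x, q << 15) ==
 * (2 * x * (q << 15)) >> 32 == (x * q) >> 16, the multiply-high above.
 */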
+
+static VPX_FORCE_INLINE int16x8_t
+highbd_quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int32x4_t zbin,
+ const int32x4_t round, const int32x4_t quant,
+ const int32x4_t quant_shift, const int32x4_t dequant) {
+ int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1;
+
+ // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values
+ const int32x4_t coeff_0 = vld1q_s32(coeff_ptr);
+ const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4);
+ highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift,
+ &qcoeff_0, &qcoeff_1);
+
+ // Store the 32-bit qcoeffs
+ vst1q_s32(qcoeff_ptr, qcoeff_0);
+ vst1q_s32(qcoeff_ptr + 4, qcoeff_1);
+
+ // Calculate and store the dqcoeffs
+ dqcoeff_0 = vmulq_s32(qcoeff_0, dequant);
+ dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1));
+
+ highbd_calculate_dqcoeff_and_store(dqcoeff_0, dqcoeff_1, dqcoeff_ptr);
+
+ return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1));
+}
+
+void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const int16x8_t neg_one = vdupq_n_s16(-1);
+ uint16x8_t eob_max;
+
+  // Only the first element of each vector is DC; the remaining elements are
+  // identical AC values. The high half can be reconstructed from the low half
+  // by duplicating the 2nd element, so only a 4x32-bit vector needs to be
+  // passed around.
+ int32x4_t zbin = vmovl_s16(vld1_s16(zbin_ptr));
+ int32x4_t round = vmovl_s16(vld1_s16(round_ptr));
+  // Widen the quant and quant_shift vectors to 32-bit elements and pre-shift
+  // them left so that vqdmulhq_s32 yields the (x * q) >> 16 multiply-high
+  // used by the scalar code.
+ int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15);
+ int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 15);
+ int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr));
+
+ // Process first 8 values which include a dc component.
+ {
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+ const int16x8_t qcoeff =
+ highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+ quant, quant_shift, dequant);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+
+ __builtin_prefetch(coeff_ptr + 64);
+
+ coeff_ptr += 8;
+ iscan += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ }
+
+ n_coeffs -= 8;
+
+ {
+ zbin = vdupq_lane_s32(vget_low_s32(zbin), 1);
+ round = vdupq_lane_s32(vget_low_s32(round), 1);
+ quant = vdupq_lane_s32(vget_low_s32(quant), 1);
+ quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1);
+ dequant = vdupq_lane_s32(vget_low_s32(dequant), 1);
+
+ do {
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+ const int16x8_t qcoeff =
+ highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin,
+ round, quant, quant_shift, dequant);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max =
+ vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+
+ __builtin_prefetch(coeff_ptr + 64);
+ coeff_ptr += 8;
+ iscan += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ n_coeffs -= 8;
+ } while (n_coeffs > 0);
+ }
+
+#ifdef __aarch64__
+ *eob_ptr = vmaxvq_u16(eob_max);
+#else
+ {
+ const uint16x4_t eob_max_0 =
+ vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
+ const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
+ const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
+ vst1_lane_u16(eob_ptr, eob_max_2, 0);
+ }
+#endif // __aarch64__
+ // Need these here, else the compiler complains about mixing declarations and
+ // code in C90
+ (void)n_coeffs;
+ (void)scan;
+}
+
+static VPX_FORCE_INLINE int32x4_t extract_sign_bit(int32x4_t a) {
+ return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
+}
+
+static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store_32x32(
+ int32x4_t dqcoeff_0, int32x4_t dqcoeff_1, tran_low_t *dqcoeff_ptr) {
+ // Add 1 if negative to round towards zero because the C uses division.
+ dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
+ dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
+
+ dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1);
+ dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1);
+ vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+ vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
+}
+
+static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon(
+ const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int32x4_t zbin, const int32x4_t round,
+ const int32x4_t quant, const int32x4_t quant_shift,
+ const int32x4_t dequant) {
+ int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1;
+
+ // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values
+ const int32x4_t coeff_0 = vld1q_s32(coeff_ptr);
+ const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4);
+ highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift,
+ &qcoeff_0, &qcoeff_1);
+
+ // Store the 32-bit qcoeffs
+ vst1q_s32(qcoeff_ptr, qcoeff_0);
+ vst1q_s32(qcoeff_ptr + 4, qcoeff_1);
+
+ // Calculate and store the dqcoeffs
+ dqcoeff_0 = vmulq_s32(qcoeff_0, dequant);
+ dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1));
+
+ highbd_calculate_dqcoeff_and_store_32x32(dqcoeff_0, dqcoeff_1, dqcoeff_ptr);
+
+ return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1));
+}
+
+void vpx_highbd_quantize_b_32x32_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const int16x8_t neg_one = vdupq_n_s16(-1);
+ uint16x8_t eob_max;
+ int i;
+
+  // Only the first element of each vector is DC; the remaining elements are
+  // identical AC values. The high half can be reconstructed from the low half
+  // by duplicating the 2nd element, so only a 4x32-bit vector needs to be
+  // passed around.
+ int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(zbin_ptr)), 1);
+ int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(round_ptr)), 1);
+  // Widen the quant and quant_shift vectors to 32-bit elements and pre-shift
+  // them left so that vqdmulhq_s32 yields the multiply-high the scalar code
+  // needs (quant_shift gets one extra bit to match the 32x32 scaling).
+ int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15);
+ int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 16);
+ int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr));
+
+ // Process first 8 values which include a dc component.
+ {
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+ const int16x8_t qcoeff =
+ highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin,
+ round, quant, quant_shift, dequant);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+
+ __builtin_prefetch(coeff_ptr + 64);
+ coeff_ptr += 8;
+ iscan += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ }
+
+ {
+ zbin = vdupq_lane_s32(vget_low_s32(zbin), 1);
+ round = vdupq_lane_s32(vget_low_s32(round), 1);
+ quant = vdupq_lane_s32(vget_low_s32(quant), 1);
+ quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1);
+ dequant = vdupq_lane_s32(vget_low_s32(dequant), 1);
+
+ for (i = 1; i < 32 * 32 / 8; ++i) {
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+ const int16x8_t qcoeff =
+ highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin,
+ round, quant, quant_shift, dequant);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max =
+ vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+
+ __builtin_prefetch(coeff_ptr + 64);
+ coeff_ptr += 8;
+ iscan += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ }
+ }
+
+#ifdef __aarch64__
+ *eob_ptr = vmaxvq_u16(eob_max);
+#else
+ {
+ const uint16x4_t eob_max_0 =
+ vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
+ const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
+ const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
+ vst1_lane_u16(eob_ptr, eob_max_2, 0);
+ }
+#endif // __aarch64__
+ // Need these here, else the compiler complains about mixing declarations and
+ // code in C90
+ (void)n_coeffs;
+ (void)scan;
+}
diff --git a/libvpx/vpx_dsp/arm/highbd_sad_neon.c b/libvpx/vpx_dsp/arm/highbd_sad_neon.c
new file mode 100644
index 000000000..ecb52ce5a
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/highbd_sad_neon.c
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+static VPX_FORCE_INLINE uint32_t highbd_sad4_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int width,
+ int height) {
+ int i, j;
+ uint32x4_t sum_abs_diff = vdupq_n_u32(0);
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j += 4) {
+ const uint16x4_t src_u16 = vld1_u16(src16_ptr + j);
+ const uint16x4_t ref_u16 = vld1_u16(ref16_ptr + j);
+ sum_abs_diff = vabal_u16(sum_abs_diff, src_u16, ref_u16);
+ }
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ }
+
+ return horizontal_add_uint32x4(sum_abs_diff);
+}
+
+static VPX_FORCE_INLINE uint32_t highbd_sad8_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int width,
+ int height) {
+ int i, j;
+ uint32x4_t sum_abs_diff = vdupq_n_u32(0);
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j += 8) {
+ const uint16x8_t src_u16 = vld1q_u16(src16_ptr + j);
+ const uint16x8_t ref_u16 = vld1q_u16(ref16_ptr + j);
+ sum_abs_diff =
+ vabal_u16(sum_abs_diff, vget_low_u16(src_u16), vget_low_u16(ref_u16));
+ sum_abs_diff = vabal_u16(sum_abs_diff, vget_high_u16(src_u16),
+ vget_high_u16(ref_u16));
+ }
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ }
+
+ return horizontal_add_uint32x4(sum_abs_diff);
+}
+
+static VPX_FORCE_INLINE uint32_t highbd_sad4_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, const uint8_t *second_pred, int width, int height) {
+ int i, j;
+ uint32x4_t sum_abs_diff = vdupq_n_u32(0);
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j += 4) {
+ const uint16x4_t a_u16 = vld1_u16(src16_ptr + j);
+ const uint16x4_t b_u16 = vld1_u16(ref16_ptr + j);
+ const uint16x4_t c_u16 = vld1_u16(pred_ptr + j);
+ const uint16x4_t avg = vrhadd_u16(b_u16, c_u16);
+ sum_abs_diff = vabal_u16(sum_abs_diff, a_u16, avg);
+ }
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred_ptr += width;
+ }
+
+ return horizontal_add_uint32x4(sum_abs_diff);
+}
+
+static VPX_FORCE_INLINE uint32_t highbd_sad8_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, const uint8_t *second_pred, int width, int height) {
+ int i, j;
+ uint32x4_t sum_abs_diff = vdupq_n_u32(0);
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j += 8) {
+ const uint16x8_t a_u16 = vld1q_u16(src16_ptr + j);
+ const uint16x8_t b_u16 = vld1q_u16(ref16_ptr + j);
+ const uint16x8_t c_u16 = vld1q_u16(pred_ptr + j);
+ const uint16x8_t avg = vrhaddq_u16(b_u16, c_u16);
+ sum_abs_diff =
+ vabal_u16(sum_abs_diff, vget_low_u16(a_u16), vget_low_u16(avg));
+ sum_abs_diff =
+ vabal_u16(sum_abs_diff, vget_high_u16(a_u16), vget_high_u16(avg));
+ }
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred_ptr += width;
+ }
+
+ return horizontal_add_uint32x4(sum_abs_diff);
+}
+
+#define highbd_sad4MxN(m, n) \
+ unsigned int vpx_highbd_sad##m##x##n##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return highbd_sad4_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \
+ }
+
+#define highbd_sadMxN(m, n) \
+ unsigned int vpx_highbd_sad##m##x##n##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return highbd_sad8_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \
+ }
+
+#define highbd_sad4MxN_avg(m, n) \
+ unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return highbd_sad4_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \
+ second_pred, m, n); \
+ }
+
+#define highbd_sadMxN_avg(m, n) \
+ unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return highbd_sad8_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \
+ second_pred, m, n); \
+ }
+
+#define highbd_sadMxNx4D(m, n) \
+ void vpx_highbd_sad##m##x##n##x4d_neon( \
+ const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *const ref_array[4], int ref_stride, \
+ uint32_t sad_array[4]) { \
+ int i; \
+ for (i = 0; i < 4; ++i) { \
+ sad_array[i] = vpx_highbd_sad##m##x##n##_neon(src_ptr, src_stride, \
+ ref_array[i], ref_stride); \
+ } \
+ }
+
+/* clang-format off */
+// 4x4
+highbd_sad4MxN(4, 4)
+highbd_sad4MxN_avg(4, 4)
+highbd_sadMxNx4D(4, 4)
+
+// 4x8
+highbd_sad4MxN(4, 8)
+highbd_sad4MxN_avg(4, 8)
+highbd_sadMxNx4D(4, 8)
+
+// 8x4
+highbd_sadMxN(8, 4)
+highbd_sadMxN_avg(8, 4)
+highbd_sadMxNx4D(8, 4)
+
+// 8x8
+highbd_sadMxN(8, 8)
+highbd_sadMxN_avg(8, 8)
+highbd_sadMxNx4D(8, 8)
+
+// 8x16
+highbd_sadMxN(8, 16)
+highbd_sadMxN_avg(8, 16)
+highbd_sadMxNx4D(8, 16)
+
+// 16x8
+highbd_sadMxN(16, 8)
+highbd_sadMxN_avg(16, 8)
+highbd_sadMxNx4D(16, 8)
+
+// 16x16
+highbd_sadMxN(16, 16)
+highbd_sadMxN_avg(16, 16)
+highbd_sadMxNx4D(16, 16)
+
+// 16x32
+highbd_sadMxN(16, 32)
+highbd_sadMxN_avg(16, 32)
+highbd_sadMxNx4D(16, 32)
+
+// 32x16
+highbd_sadMxN(32, 16)
+highbd_sadMxN_avg(32, 16)
+highbd_sadMxNx4D(32, 16)
+
+// 32x32
+highbd_sadMxN(32, 32)
+highbd_sadMxN_avg(32, 32)
+highbd_sadMxNx4D(32, 32)
+
+// 32x64
+highbd_sadMxN(32, 64)
+highbd_sadMxN_avg(32, 64)
+highbd_sadMxNx4D(32, 64)
+
+// 64x32
+highbd_sadMxN(64, 32)
+highbd_sadMxN_avg(64, 32)
+highbd_sadMxNx4D(64, 32)
+
+// 64x64
+highbd_sadMxN(64, 64)
+highbd_sadMxN_avg(64, 64)
+highbd_sadMxNx4D(64, 64)
+ /* clang-format on */
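For reference, each macro invocation above generates a thin wrapper; for example, highbd_sadMxN(16, 16) expands to (whitespace aside):

unsigned int vpx_highbd_sad16x16_neon(const uint8_t *src_ptr, int src_stride,
                                      const uint8_t *ref_ptr, int ref_stride) {
  return highbd_sad8_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16);
}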
diff --git a/libvpx/vpx_dsp/arm/highbd_variance_neon.c b/libvpx/vpx_dsp/arm/highbd_variance_neon.c
new file mode 100644
index 000000000..96a35af01
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/highbd_variance_neon.c
@@ -0,0 +1,496 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_ports/mem.h"
+
+static const uint8_t bilinear_filters[8][2] = {
+ { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+ { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
+};
+
+static INLINE void highbd_variance16(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *ref_ptr, int ref_stride,
+ int w, int h, uint64_t *sse,
+ int64_t *sum) {
+ int i, j;
+
+ if (w >= 8) {
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const int16x8_t src_s16 = vreinterpretq_s16_u16(vld1q_u16(&src_ptr[j]));
+ const int16x8_t ref_s16 = vreinterpretq_s16_u16(vld1q_u16(&ref_ptr[j]));
+ const int32x4_t diff1_s32 =
+ vsubl_s16(vget_low_s16(src_s16), vget_low_s16(ref_s16));
+ const int32x4_t diff2_s32 =
+ vsubl_s16(vget_high_s16(src_s16), vget_high_s16(ref_s16));
+ const uint32x4_t diff1_u32 = vreinterpretq_u32_s32(diff1_s32);
+ const uint32x4_t diff2_u32 = vreinterpretq_u32_s32(diff2_s32);
+ sum_s32 = vaddq_s32(sum_s32, diff1_s32);
+ sum_s32 = vaddq_s32(sum_s32, diff2_s32);
+ sse_u32 = vmlaq_u32(sse_u32, diff1_u32, diff1_u32);
+ sse_u32 = vmlaq_u32(sse_u32, diff2_u32, diff2_u32);
+ }
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+ *sum = horizontal_add_int32x4(sum_s32);
+ *sse = horizontal_add_uint32x4(sse_u32);
+ } else {
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+ assert(w >= 4);
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 4) {
+ const int16x4_t src_s16 = vreinterpret_s16_u16(vld1_u16(&src_ptr[j]));
+ const int16x4_t ref_s16 = vreinterpret_s16_u16(vld1_u16(&ref_ptr[j]));
+ const int32x4_t diff_s32 = vsubl_s16(src_s16, ref_s16);
+ const uint32x4_t diff_u32 = vreinterpretq_u32_s32(diff_s32);
+ sum_s32 = vaddq_s32(sum_s32, diff_s32);
+ sse_u32 = vmlaq_u32(sse_u32, diff_u32, diff_u32);
+ }
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+ *sum = horizontal_add_int32x4(sum_s32);
+ *sse = horizontal_add_uint32x4(sse_u32);
+ }
+}
+
+static INLINE void highbd_variance64(const uint8_t *src8_ptr, int src_stride,
+ const uint8_t *ref8_ptr, int ref_stride,
+ int w, int h, uint64_t *sse,
+ int64_t *sum) {
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr);
+ uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr);
+
+ if (w < 32 && h < 32) {
+ highbd_variance16(src_ptr, src_stride, ref_ptr, ref_stride, w, h, sse, sum);
+ } else {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+ int k, l;
+ for (k = 0; k + 16 <= h; k += 16) {
+ for (l = 0; l + 16 <= w; l += 16) {
+ uint64_t sse_tmp = 0;
+ int64_t sum_tmp = 0;
+ highbd_variance16(src_ptr + l, src_stride, ref_ptr + l, ref_stride, 16,
+ 16, &sse_tmp, &sum_tmp);
+ sum_long += sum_tmp;
+ sse_long += sse_tmp;
+ }
+ src_ptr += 16 * src_stride;
+ ref_ptr += 16 * ref_stride;
+ }
+ *sum = sum_long;
+ *sse = sse_long;
+ }
+}
+
+static INLINE void highbd_8_variance(const uint8_t *src8_ptr, int src_stride,
+ const uint8_t *ref8_ptr, int ref_stride,
+ int w, int h, uint32_t *sse, int *sum) {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+ highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long,
+ &sum_long);
+ *sse = (uint32_t)sse_long;
+ *sum = (int)sum_long;
+}
+
+static INLINE void highbd_10_variance(const uint8_t *src8_ptr, int src_stride,
+ const uint8_t *ref8_ptr, int ref_stride,
+ int w, int h, uint32_t *sse, int *sum) {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+ highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long,
+ &sum_long);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
+}
+
+static INLINE void highbd_12_variance(const uint8_t *src8_ptr, int src_stride,
+ const uint8_t *ref8_ptr, int ref_stride,
+ int w, int h, uint32_t *sse, int *sum) {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+ highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long,
+ &sum_long);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
+}
+
+#define HIGHBD_VAR(W, H) \
+ uint32_t vpx_highbd_8_variance##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+ &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
+ } \
+ \
+ uint32_t vpx_highbd_10_variance##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+ &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t vpx_highbd_12_variance##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+ &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define HIGHBD_GET_VAR(S) \
+ void vpx_highbd_8_get##S##x##S##var_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \
+ sum); \
+ } \
+ \
+ void vpx_highbd_10_get##S##x##S##var_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \
+ sum); \
+ } \
+ \
+ void vpx_highbd_12_get##S##x##S##var_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \
+ sum); \
+ }
+
+#define HIGHBD_MSE(W, H) \
+ uint32_t vpx_highbd_8_mse##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+ &sum); \
+ return *sse; \
+ } \
+ \
+ uint32_t vpx_highbd_10_mse##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+ &sum); \
+ return *sse; \
+ } \
+ \
+ uint32_t vpx_highbd_12_mse##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+ &sum); \
+ return *sse; \
+ }
+
+static INLINE void highbd_var_filter_block2d_bil_first_pass(
+ const uint8_t *src_ptr8, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const uint8_t *filter) {
+ uint32_t i, j;
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
+
+ uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1);
+ uint16x4_t filter1_u16 = vdup_n_u16(filter[0]);
+ uint16x4_t filter2_u16 = vdup_n_u16(filter[1]);
+
+ if (output_width >= 8) {
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; j += 8) {
+ const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]);
+ const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]);
+ uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16));
+ uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16));
+ uint16x4_t out1_u16;
+ uint16x4_t out2_u16;
+ sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16));
+ sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16));
+ out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS);
+ out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS);
+ vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16));
+ }
+ // Next row...
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+ } else {
+ assert(output_width >= 4);
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; j += 4) {
+ const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]);
+ const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]);
+ uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16);
+ uint16x4_t out_u16;
+ sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16);
+ out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS);
+ vst1_u16(&output_ptr[j], out_u16);
+ }
+ // Next row...
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+ }
+}
+
+static INLINE void highbd_var_filter_block2d_bil_second_pass(
+ const uint16_t *src_ptr, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, unsigned int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const uint8_t *filter) {
+ uint32_t i, j;
+
+ uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1);
+ uint16x4_t filter1_u16 = vdup_n_u16(filter[0]);
+ uint16x4_t filter2_u16 = vdup_n_u16(filter[1]);
+
+ if (output_width >= 8) {
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; j += 8) {
+ const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]);
+ const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]);
+ uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16));
+ uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16));
+ uint16x4_t out1_u16;
+ uint16x4_t out2_u16;
+ sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16));
+ sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16));
+ out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS);
+ out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS);
+ vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16));
+ }
+ // Next row...
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+ } else {
+ assert(output_width >= 4);
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; j += 4) {
+ const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]);
+ const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]);
+ uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16);
+ uint16x4_t out_u16;
+ sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16);
+ out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS);
+ vst1_u16(&output_ptr[j], out_u16);
+ }
+ // Next row...
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+ }
+}
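Both bilinear passes apply the same per-pixel filter; written out (FILTER_BITS is the libvpx filter precision, and the two taps in each bilinear_filters row sum to 1 << FILTER_BITS):

/*
 *   out[j] = (filter[0] * src[j] + filter[1] * src[j + pixel_step] +
 *             (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
 */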
+
+#define HIGHBD_SUBPIX_VAR(W, H) \
+ uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp2), W, \
+ ref_ptr, ref_stride, sse); \
+ } \
+ \
+ uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ return vpx_highbd_10_variance##W##x##H##_neon( \
+ CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \
+ } \
+ \
+ uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ return vpx_highbd_12_variance##W##x##H##_neon( \
+ CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \
+ }
+
+#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
+ uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \
+ H, temp2, W); \
+ \
+ return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp3), W, \
+ ref_ptr, ref_stride, sse); \
+ } \
+ \
+ uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \
+ H, temp2, W); \
+ \
+ return vpx_highbd_10_variance##W##x##H##_neon( \
+ CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, ref_stride, sse); \
+ } \
+ \
+ uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \
+ H, temp2, W); \
+ \
+ return vpx_highbd_12_variance##W##x##H##_neon( \
+ CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, ref_stride, sse); \
+ }
+
+void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred,
+ int width, int height, const uint16_t *ref,
+ int ref_stride) {
+ int i, j;
+ uint32x4_t one_u32 = vdupq_n_u32(1);
+ if (width >= 8) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; j += 8) {
+ const uint16x8_t pred_u16 = vld1q_u16(&pred[j]);
+ const uint16x8_t ref_u16 = vld1q_u16(&ref[j]);
+ const uint32x4_t sum1_u32 =
+ vaddl_u16(vget_low_u16(pred_u16), vget_low_u16(ref_u16));
+ const uint32x4_t sum2_u32 =
+ vaddl_u16(vget_high_u16(pred_u16), vget_high_u16(ref_u16));
+ const uint16x4_t sum1_u16 =
+ vshrn_n_u32(vaddq_u32(sum1_u32, one_u32), 1);
+ const uint16x4_t sum2_u16 =
+ vshrn_n_u32(vaddq_u32(sum2_u32, one_u32), 1);
+ const uint16x8_t vcomp_pred = vcombine_u16(sum1_u16, sum2_u16);
+ vst1q_u16(&comp_pred[j], vcomp_pred);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+ } else {
+ assert(width >= 4);
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; j += 4) {
+ const uint16x4_t pred_u16 = vld1_u16(&pred[j]);
+ const uint16x4_t ref_u16 = vld1_u16(&ref[j]);
+ const uint32x4_t sum_u32 = vaddl_u16(pred_u16, ref_u16);
+ const uint16x4_t vcomp_pred =
+ vshrn_n_u32(vaddq_u32(sum_u32, one_u32), 1);
+ vst1_u16(&comp_pred[j], vcomp_pred);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+ }
+}
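
vpx_highbd_comp_avg_pred_neon above is a straight rounded average: widening the two 16-bit inputs to 32 bits, adding 1 and narrowing with a shift by one is exactly (pred + ref + 1) >> 1 per pixel. A minimal scalar equivalent (the function name is illustrative):

    #include <stdint.h>

    /* Scalar model of the rounded compound average computed above. */
    static void highbd_comp_avg_pred_sketch(uint16_t *comp_pred,
                                            const uint16_t *pred, int width,
                                            int height, const uint16_t *ref,
                                            int ref_stride) {
      int i, j;
      for (i = 0; i < height; ++i) {
        for (j = 0; j < width; ++j) {
          comp_pred[j] = (uint16_t)((pred[j] + ref[j] + 1) >> 1);
        }
        comp_pred += width;
        pred += width;
        ref += ref_stride;
      }
    }
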
+
+/* All three forms of the variance are available in the same sizes. */
+#define HIGHBD_VARIANCES(W, H) \
+ HIGHBD_VAR(W, H) \
+ HIGHBD_SUBPIX_VAR(W, H) \
+ HIGHBD_SUBPIX_AVG_VAR(W, H)
+
+HIGHBD_VARIANCES(64, 64)
+HIGHBD_VARIANCES(64, 32)
+HIGHBD_VARIANCES(32, 64)
+HIGHBD_VARIANCES(32, 32)
+HIGHBD_VARIANCES(32, 16)
+HIGHBD_VARIANCES(16, 32)
+HIGHBD_VARIANCES(16, 16)
+HIGHBD_VARIANCES(16, 8)
+HIGHBD_VARIANCES(8, 16)
+HIGHBD_VARIANCES(8, 8)
+HIGHBD_VARIANCES(8, 4)
+HIGHBD_VARIANCES(4, 8)
+HIGHBD_VARIANCES(4, 4)
+
+HIGHBD_GET_VAR(8)
+HIGHBD_GET_VAR(16)
+
+HIGHBD_MSE(16, 16)
+HIGHBD_MSE(16, 8)
+HIGHBD_MSE(8, 16)
+HIGHBD_MSE(8, 8)
diff --git a/libvpx/vpx_dsp/arm/mem_neon.h b/libvpx/vpx_dsp/arm/mem_neon.h
index 50aaa94fe..19cfc7c7f 100644
--- a/libvpx/vpx_dsp/arm/mem_neon.h
+++ b/libvpx/vpx_dsp/arm/mem_neon.h
@@ -116,11 +116,11 @@ static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) {
static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf,
ptrdiff_t stride) {
uint32_t a;
- uint32x2_t a_u32 = vdup_n_u32(0);
+ uint32x2_t a_u32;
if (stride == 4) return vld1_u8(buf);
memcpy(&a, buf, 4);
buf += stride;
- a_u32 = vset_lane_u32(a, a_u32, 0);
+ a_u32 = vdup_n_u32(a);
memcpy(&a, buf, 4);
a_u32 = vset_lane_u32(a, a_u32, 1);
return vreinterpret_u8_u32(a_u32);
@@ -143,11 +143,11 @@ static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride,
static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf,
ptrdiff_t stride) {
uint32_t a;
- uint32x4_t a_u32 = vdupq_n_u32(0);
+ uint32x4_t a_u32;
if (stride == 4) return vld1q_u8(buf);
memcpy(&a, buf, 4);
buf += stride;
- a_u32 = vsetq_lane_u32(a, a_u32, 0);
+ a_u32 = vdupq_n_u32(a);
memcpy(&a, buf, 4);
buf += stride;
a_u32 = vsetq_lane_u32(a, a_u32, 1);
@@ -201,4 +201,161 @@ static INLINE void store_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) {
buf += stride;
vst1_lane_u32((uint32_t *)buf, a_u32, 1);
}
+
+static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
+ uint8x8_t *const s0, uint8x8_t *const s1,
+ uint8x8_t *const s2, uint8x8_t *const s3) {
+ *s0 = vld1_u8(s);
+ s += p;
+ *s1 = vld1_u8(s);
+ s += p;
+ *s2 = vld1_u8(s);
+ s += p;
+ *s3 = vld1_u8(s);
+}
+
+static INLINE void store_u8_8x4(uint8_t *s, const ptrdiff_t p,
+ const uint8x8_t s0, const uint8x8_t s1,
+ const uint8x8_t s2, const uint8x8_t s3) {
+ vst1_u8(s, s0);
+ s += p;
+ vst1_u8(s, s1);
+ s += p;
+ vst1_u8(s, s2);
+ s += p;
+ vst1_u8(s, s3);
+}
+
+static INLINE void load_u8_16x4(const uint8_t *s, const ptrdiff_t p,
+ uint8x16_t *const s0, uint8x16_t *const s1,
+ uint8x16_t *const s2, uint8x16_t *const s3) {
+ *s0 = vld1q_u8(s);
+ s += p;
+ *s1 = vld1q_u8(s);
+ s += p;
+ *s2 = vld1q_u8(s);
+ s += p;
+ *s3 = vld1q_u8(s);
+}
+
+static INLINE void store_u8_16x4(uint8_t *s, const ptrdiff_t p,
+ const uint8x16_t s0, const uint8x16_t s1,
+ const uint8x16_t s2, const uint8x16_t s3) {
+ vst1q_u8(s, s0);
+ s += p;
+ vst1q_u8(s, s1);
+ s += p;
+ vst1q_u8(s, s2);
+ s += p;
+ vst1q_u8(s, s3);
+}
+
+static INLINE void load_u8_8x7(const uint8_t *s, const ptrdiff_t p,
+ uint8x8_t *const s0, uint8x8_t *const s1,
+ uint8x8_t *const s2, uint8x8_t *const s3,
+ uint8x8_t *const s4, uint8x8_t *const s5,
+ uint8x8_t *const s6) {
+ *s0 = vld1_u8(s);
+ s += p;
+ *s1 = vld1_u8(s);
+ s += p;
+ *s2 = vld1_u8(s);
+ s += p;
+ *s3 = vld1_u8(s);
+ s += p;
+ *s4 = vld1_u8(s);
+ s += p;
+ *s5 = vld1_u8(s);
+ s += p;
+ *s6 = vld1_u8(s);
+}
+
+static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p,
+ uint8x8_t *const s0, uint8x8_t *const s1,
+ uint8x8_t *const s2, uint8x8_t *const s3,
+ uint8x8_t *const s4, uint8x8_t *const s5,
+ uint8x8_t *const s6, uint8x8_t *const s7) {
+ *s0 = vld1_u8(s);
+ s += p;
+ *s1 = vld1_u8(s);
+ s += p;
+ *s2 = vld1_u8(s);
+ s += p;
+ *s3 = vld1_u8(s);
+ s += p;
+ *s4 = vld1_u8(s);
+ s += p;
+ *s5 = vld1_u8(s);
+ s += p;
+ *s6 = vld1_u8(s);
+ s += p;
+ *s7 = vld1_u8(s);
+}
+
+static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p,
+ const uint8x8_t s0, const uint8x8_t s1,
+ const uint8x8_t s2, const uint8x8_t s3,
+ const uint8x8_t s4, const uint8x8_t s5,
+ const uint8x8_t s6, const uint8x8_t s7) {
+ vst1_u8(s, s0);
+ s += p;
+ vst1_u8(s, s1);
+ s += p;
+ vst1_u8(s, s2);
+ s += p;
+ vst1_u8(s, s3);
+ s += p;
+ vst1_u8(s, s4);
+ s += p;
+ vst1_u8(s, s5);
+ s += p;
+ vst1_u8(s, s6);
+ s += p;
+ vst1_u8(s, s7);
+}
+
+static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p,
+ uint8x16_t *const s0, uint8x16_t *const s1,
+ uint8x16_t *const s2, uint8x16_t *const s3,
+ uint8x16_t *const s4, uint8x16_t *const s5,
+ uint8x16_t *const s6, uint8x16_t *const s7) {
+ *s0 = vld1q_u8(s);
+ s += p;
+ *s1 = vld1q_u8(s);
+ s += p;
+ *s2 = vld1q_u8(s);
+ s += p;
+ *s3 = vld1q_u8(s);
+ s += p;
+ *s4 = vld1q_u8(s);
+ s += p;
+ *s5 = vld1q_u8(s);
+ s += p;
+ *s6 = vld1q_u8(s);
+ s += p;
+ *s7 = vld1q_u8(s);
+}
+
+static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p,
+ const uint8x16_t s0, const uint8x16_t s1,
+ const uint8x16_t s2, const uint8x16_t s3,
+ const uint8x16_t s4, const uint8x16_t s5,
+ const uint8x16_t s6, const uint8x16_t s7) {
+ vst1q_u8(s, s0);
+ s += p;
+ vst1q_u8(s, s1);
+ s += p;
+ vst1q_u8(s, s2);
+ s += p;
+ vst1q_u8(s, s3);
+ s += p;
+ vst1q_u8(s, s4);
+ s += p;
+ vst1q_u8(s, s5);
+ s += p;
+ vst1q_u8(s, s6);
+ s += p;
+ vst1q_u8(s, s7);
+}
+
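The new load_u8_*/store_u8_* helpers simply walk p-strided rows into separate registers, which keeps the convolution and transform kernels free of repetitive load/store boilerplate. A small hedged usage sketch, assuming a NEON-enabled build with this header included; copy_block_8x8 is illustrative, not part of the library:

    #include <arm_neon.h>
    #include <stddef.h>

    /* Illustrative only: copy an 8x8 block using the helpers added above. */
    static void copy_block_8x8(uint8_t *dst, ptrdiff_t dst_stride,
                               const uint8_t *src, ptrdiff_t src_stride) {
      uint8x8_t r0, r1, r2, r3, r4, r5, r6, r7;
      load_u8_8x8(src, src_stride, &r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
      store_u8_8x8(dst, dst_stride, r0, r1, r2, r3, r4, r5, r6, r7);
    }
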
#endif // VPX_VPX_DSP_ARM_MEM_NEON_H_
diff --git a/libvpx/vpx_dsp/arm/quantize_neon.c b/libvpx/vpx_dsp/arm/quantize_neon.c
index bd7818a07..9c227d560 100644
--- a/libvpx/vpx_dsp/arm/quantize_neon.c
+++ b/libvpx/vpx_dsp/arm/quantize_neon.c
@@ -17,20 +17,57 @@
static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff,
const int16x8_t dequant,
- tran_low_t *dqcoeff) {
+ tran_low_t *dqcoeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
const int32x4_t dqcoeff_0 =
vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
const int32x4_t dqcoeff_1 =
vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
-#if CONFIG_VP9_HIGHBITDEPTH
- vst1q_s32(dqcoeff, dqcoeff_0);
- vst1q_s32(dqcoeff + 4, dqcoeff_1);
+ vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+ vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
#else
- vst1q_s16(dqcoeff, vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1)));
+ vst1q_s16(dqcoeff_ptr, vmulq_s16(qcoeff, dequant));
#endif // CONFIG_VP9_HIGHBITDEPTH
}
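
calculate_dqcoeff_and_store above computes dqcoeff = qcoeff * dequant element-wise; only the storage differs between builds (32-bit tran_low_t stores under CONFIG_VP9_HIGHBITDEPTH, a plain 16-bit multiply-and-store otherwise). The scalar equivalent of the high bit-depth path is simply:

    #include <stdint.h>

    /* Scalar model of the dequantization store above (high bit-depth case,
     * where tran_low_t is 32 bits wide). */
    static void dqcoeff_store_sketch(const int16_t *qcoeff,
                                     const int16_t *dequant, int32_t *dqcoeff,
                                     int n) {
      int i;
      for (i = 0; i < n; ++i) dqcoeff[i] = (int32_t)qcoeff[i] * dequant[i];
    }
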
+static INLINE int16x8_t
+quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16x8_t zbin,
+ const int16x8_t round, const int16x8_t quant,
+ const int16x8_t quant_shift, const int16x8_t dequant) {
+ // Load coeffs as 8 x 16-bit ints, take sign and abs values
+ const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
+ const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+ const int16x8_t coeff_abs = vabsq_s16(coeff);
+
+ // Calculate mask of elements outside the bin
+ const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+
+ // Get the rounded values
+ const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
+
+ // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
+ int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
+
+ qcoeff = vaddq_s16(qcoeff, rounded);
+
+ // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16
+ qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1);
+
+ // Restore the sign bit.
+ qcoeff = veorq_s16(qcoeff, coeff_sign);
+ qcoeff = vsubq_s16(qcoeff, coeff_sign);
+
+ // Only keep the relevant coeffs
+ qcoeff = vandq_s16(qcoeff, zbin_mask);
+ store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+
+ calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr);
+
+ return qcoeff;
+}
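
The shifts in quantize_b_neon lean on the identity the comments state: vqdmulhq_s16(a, b) returns the (saturated) high half of 2*a*b, so following it with a shift right by 1 yields (a*b) >> 16. One lane of the helper therefore behaves like the scalar sketch below; saturating adds and the 16-bit wrap-around of the vector registers are glossed over, and the function name is illustrative.

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar model of one lane of quantize_b_neon above. */
    static int16_t quantize_b_lane_sketch(int16_t coeff, int16_t zbin,
                                          int16_t round, int16_t quant,
                                          int16_t quant_shift, int16_t dequant,
                                          int32_t *dqcoeff) {
      const int32_t abs_coeff = abs(coeff);
      int32_t tmp, q;
      if (abs_coeff < zbin) {      /* zbin mask: nothing inside the zero-bin */
        *dqcoeff = 0;
        return 0;
      }
      tmp = abs_coeff + round;              /* vqaddq_s16 (rounding term) */
      q = ((tmp * quant) >> 16) + tmp;      /* vqdmulhq_s16 + vshrq_n_s16(1) */
      q = (q * quant_shift) >> 16;          /* second vqdmulhq + shift */
      if (coeff < 0) q = -q;                /* restore the sign */
      *dqcoeff = q * dequant;               /* calculate_dqcoeff_and_store */
      return (int16_t)q;
    }
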
+
void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr,
@@ -38,109 +75,59 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan,
const int16_t *iscan) {
- const int16x8_t one = vdupq_n_s16(1);
const int16x8_t neg_one = vdupq_n_s16(-1);
uint16x8_t eob_max;
- (void)scan;
+
+ // Only the first element of each vector is DC.
+ int16x8_t zbin = vld1q_s16(zbin_ptr);
+ int16x8_t round = vld1q_s16(round_ptr);
+ int16x8_t quant = vld1q_s16(quant_ptr);
+ int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
+ int16x8_t dequant = vld1q_s16(dequant_ptr);
// Process first 8 values which include a dc component.
{
- // Only the first element of each vector is DC.
- const int16x8_t zbin = vld1q_s16(zbin_ptr);
- const int16x8_t round = vld1q_s16(round_ptr);
- const int16x8_t quant = vld1q_s16(quant_ptr);
- const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
- const int16x8_t dequant = vld1q_s16(dequant_ptr);
- // Add one because the eob does not index from 0.
- const uint16x8_t v_iscan =
- vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
-
- const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
- const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
- const int16x8_t coeff_abs = vabsq_s16(coeff);
-
- const int16x8_t zbin_mask =
- vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
- const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
-
- // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
- int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
-
- qcoeff = vaddq_s16(qcoeff, rounded);
-
- // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16
- qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1);
-
- // Restore the sign bit.
- qcoeff = veorq_s16(qcoeff, coeff_sign);
- qcoeff = vsubq_s16(qcoeff, coeff_sign);
-
- qcoeff = vandq_s16(qcoeff, zbin_mask);
+ const int16x8_t qcoeff =
+ quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, quant,
+ quant_shift, dequant);
// Set non-zero elements to -1 and use that to extract values for eob.
eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+ __builtin_prefetch(coeff_ptr + 64);
coeff_ptr += 8;
iscan += 8;
-
- store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
qcoeff_ptr += 8;
-
- calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr);
dqcoeff_ptr += 8;
}
n_coeffs -= 8;
{
- const int16x8_t zbin = vdupq_n_s16(zbin_ptr[1]);
- const int16x8_t round = vdupq_n_s16(round_ptr[1]);
- const int16x8_t quant = vdupq_n_s16(quant_ptr[1]);
- const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]);
- const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]);
+ zbin = vdupq_lane_s16(vget_low_s16(zbin), 1);
+ round = vdupq_lane_s16(vget_low_s16(round), 1);
+ quant = vdupq_lane_s16(vget_low_s16(quant), 1);
+ quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1);
+ dequant = vdupq_lane_s16(vget_low_s16(dequant), 1);
do {
- // Add one because the eob is not its index.
- const uint16x8_t v_iscan =
- vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
-
- const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
- const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
- const int16x8_t coeff_abs = vabsq_s16(coeff);
-
- const int16x8_t zbin_mask =
- vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
- const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
-
- // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
- int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
-
- qcoeff = vaddq_s16(qcoeff, rounded);
-
- // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16
- qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1);
-
- // Restore the sign bit.
- qcoeff = veorq_s16(qcoeff, coeff_sign);
- qcoeff = vsubq_s16(qcoeff, coeff_sign);
-
- qcoeff = vandq_s16(qcoeff, zbin_mask);
+ const int16x8_t qcoeff =
+ quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+ quant, quant_shift, dequant);
// Set non-zero elements to -1 and use that to extract values for eob.
eob_max =
vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+ __builtin_prefetch(coeff_ptr + 64);
coeff_ptr += 8;
iscan += 8;
-
- store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
qcoeff_ptr += 8;
-
- calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr);
dqcoeff_ptr += 8;
-
n_coeffs -= 8;
} while (n_coeffs > 0);
}
@@ -156,6 +143,9 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
vst1_lane_u16(eob_ptr, eob_max_2, 0);
}
#endif // __aarch64__
+  // This cast has to come after the declarations above, otherwise the
+  // compiler complains about mixing declarations and code in C90.
+  (void)scan;
}
static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
@@ -164,7 +154,7 @@ static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff,
const int16x8_t dequant,
- tran_low_t *dqcoeff) {
+ tran_low_t *dqcoeff_ptr) {
int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
int32x4_t dqcoeff_1 =
vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
@@ -176,14 +166,51 @@ static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff,
#if CONFIG_VP9_HIGHBITDEPTH
dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1);
dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1);
- vst1q_s32(dqcoeff, dqcoeff_0);
- vst1q_s32(dqcoeff + 4, dqcoeff_1);
+ vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+ vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
#else
- vst1q_s16(dqcoeff,
+ vst1q_s16(dqcoeff_ptr,
vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)));
#endif // CONFIG_VP9_HIGHBITDEPTH
}
+static INLINE int16x8_t
+quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16x8_t zbin,
+ const int16x8_t round, const int16x8_t quant,
+ const int16x8_t quant_shift, const int16x8_t dequant) {
+ // Load coeffs as 8 x 16-bit ints, take sign and abs values
+ const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
+ const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+ const int16x8_t coeff_abs = vabsq_s16(coeff);
+
+ // Calculate mask of elements outside the bin
+ const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+
+ // Get the rounded values
+ const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
+
+ // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
+ int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
+
+ qcoeff = vaddq_s16(qcoeff, rounded);
+
+ // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15
+ qcoeff = vqdmulhq_s16(qcoeff, quant_shift);
+
+ // Restore the sign bit.
+ qcoeff = veorq_s16(qcoeff, coeff_sign);
+ qcoeff = vsubq_s16(qcoeff, coeff_sign);
+
+ // Only keep the relevant coeffs
+ qcoeff = vandq_s16(qcoeff, zbin_mask);
+ store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+
+ calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr);
+
+ return qcoeff;
+}
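
quantize_b_32x32_neon differs from the 16x16 path only where the comments flag it: zbin and round arrive pre-halved (vrshrq_n_s16 by 1 in the caller below), and the last multiply keeps one extra bit, since (q * quant_shift * 2) >> 16 equals (q * quant_shift) >> 15. A one-lane scalar sketch of the qcoeff computation under those assumptions; the halved dqcoeff store is handled by calculate_dqcoeff_and_store_32x32 above and omitted here.

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar model of one lane of quantize_b_32x32_neon above, taking the
     * unhalved zbin/round values as the caller does. */
    static int16_t quantize_b_32x32_lane_sketch(int16_t coeff, int16_t zbin,
                                                int16_t round, int16_t quant,
                                                int16_t quant_shift) {
      const int32_t zbin_half = (zbin + 1) >> 1;    /* vrshrq_n_s16(zbin, 1) */
      const int32_t round_half = (round + 1) >> 1;  /* vrshrq_n_s16(round, 1) */
      const int32_t abs_coeff = abs(coeff);
      int32_t tmp, q;
      if (abs_coeff < zbin_half) return 0;
      tmp = abs_coeff + round_half;
      q = ((tmp * quant) >> 16) + tmp;
      q = (q * quant_shift) >> 15;  /* one less shift than the 16x16 path */
      return (int16_t)(coeff < 0 ? -q : q);
    }
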
+
// Main difference is that zbin values are halved before comparison and dqcoeff
// values are divided by 2. zbin is rounded but dqcoeff is not.
void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@@ -194,107 +221,57 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- const int16x8_t one = vdupq_n_s16(1);
const int16x8_t neg_one = vdupq_n_s16(-1);
uint16x8_t eob_max;
int i;
- (void)scan;
- (void)n_coeffs; // Because we will always calculate 32*32.
+
+ // Only the first element of each vector is DC.
+ int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1);
+ int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
+ int16x8_t quant = vld1q_s16(quant_ptr);
+ int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
+ int16x8_t dequant = vld1q_s16(dequant_ptr);
// Process first 8 values which include a dc component.
{
- // Only the first element of each vector is DC.
- const int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1);
- const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
- const int16x8_t quant = vld1q_s16(quant_ptr);
- const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
- const int16x8_t dequant = vld1q_s16(dequant_ptr);
- // Add one because the eob does not index from 0.
- const uint16x8_t v_iscan =
- vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
-
- const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
- const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
- const int16x8_t coeff_abs = vabsq_s16(coeff);
-
- const int16x8_t zbin_mask =
- vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
-
- const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
- // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
- int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
-
- qcoeff = vaddq_s16(qcoeff, rounded);
-
- // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15
- qcoeff = vqdmulhq_s16(qcoeff, quant_shift);
-
- // Restore the sign bit.
- qcoeff = veorq_s16(qcoeff, coeff_sign);
- qcoeff = vsubq_s16(qcoeff, coeff_sign);
-
- qcoeff = vandq_s16(qcoeff, zbin_mask);
+ const int16x8_t qcoeff =
+ quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+ quant, quant_shift, dequant);
// Set non-zero elements to -1 and use that to extract values for eob.
eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+ __builtin_prefetch(coeff_ptr + 64);
coeff_ptr += 8;
iscan += 8;
-
- store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
qcoeff_ptr += 8;
-
- calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr);
dqcoeff_ptr += 8;
}
{
- const int16x8_t zbin = vrshrq_n_s16(vdupq_n_s16(zbin_ptr[1]), 1);
- const int16x8_t round = vrshrq_n_s16(vdupq_n_s16(round_ptr[1]), 1);
- const int16x8_t quant = vdupq_n_s16(quant_ptr[1]);
- const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]);
- const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]);
+ zbin = vdupq_lane_s16(vget_low_s16(zbin), 1);
+ round = vdupq_lane_s16(vget_low_s16(round), 1);
+ quant = vdupq_lane_s16(vget_low_s16(quant), 1);
+ quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1);
+ dequant = vdupq_lane_s16(vget_low_s16(dequant), 1);
for (i = 1; i < 32 * 32 / 8; ++i) {
- // Add one because the eob is not its index.
- const uint16x8_t v_iscan =
- vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
-
- const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
- const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
- const int16x8_t coeff_abs = vabsq_s16(coeff);
-
- const int16x8_t zbin_mask =
- vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
- const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
-
- // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
- int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
-
- qcoeff = vaddq_s16(qcoeff, rounded);
-
- // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15
- qcoeff = vqdmulhq_s16(qcoeff, quant_shift);
-
- // Restore the sign bit.
- qcoeff = veorq_s16(qcoeff, coeff_sign);
- qcoeff = vsubq_s16(qcoeff, coeff_sign);
-
- qcoeff = vandq_s16(qcoeff, zbin_mask);
+ const int16x8_t qcoeff =
+ quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+ quant, quant_shift, dequant);
// Set non-zero elements to -1 and use that to extract values for eob.
eob_max =
vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+ __builtin_prefetch(coeff_ptr + 64);
coeff_ptr += 8;
iscan += 8;
-
- store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
qcoeff_ptr += 8;
-
- calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr);
dqcoeff_ptr += 8;
}
}
@@ -310,4 +287,8 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
vst1_lane_u16(eob_ptr, eob_max_2, 0);
}
#endif // __aarch64__
+  // These casts have to come after the declarations above, otherwise the
+  // compiler complains about mixing declarations and code in C90.
+  (void)n_coeffs;
+  (void)scan;
}
diff --git a/libvpx/vpx_dsp/arm/sad4d_neon.c b/libvpx/vpx_dsp/arm/sad4d_neon.c
index 03f716c3d..5fc621aee 100644
--- a/libvpx/vpx_dsp/arm/sad4d_neon.c
+++ b/libvpx/vpx_dsp/arm/sad4d_neon.c
@@ -20,9 +20,9 @@
static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0,
const void *const buf1) {
uint32_t a;
- uint32x2_t aa = vdup_n_u32(0);
+ uint32x2_t aa;
memcpy(&a, buf0, 4);
- aa = vset_lane_u32(a, aa, 0);
+ aa = vdup_n_u32(a);
memcpy(&a, buf1, 4);
aa = vset_lane_u32(a, aa, 1);
return vreinterpret_u8_u32(aa);
@@ -237,8 +237,7 @@ void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride,
////////////////////////////////////////////////////////////////////////////////
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
- (__ARM_FEATURE_DOTPROD == 1)
+#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr,
uint32x4_t *const sum) {
@@ -270,7 +269,7 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
vst1q_u32(sad_array, vpaddq_u32(r0, r1));
}
-#else
+#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr,
uint16x8_t *const sum) {
@@ -305,7 +304,7 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
sad_512_pel_final_neon(sum, sad_array);
}
-#endif
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
@@ -327,8 +326,7 @@ void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride,
////////////////////////////////////////////////////////////////////////////////
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
- (__ARM_FEATURE_DOTPROD == 1)
+#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
@@ -386,7 +384,7 @@ void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 64);
}
-#else
+#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
@@ -444,12 +442,11 @@ void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
sad_2048_pel_final_neon(sum, sad_array);
}
-#endif
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
////////////////////////////////////////////////////////////////////////////////
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
- (__ARM_FEATURE_DOTPROD == 1)
+#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
@@ -554,7 +551,7 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
vst1q_u32(sad_array, vpaddq_u32(r0, r1));
}
-#else
+#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
@@ -649,4 +646,4 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
sad_4096_pel_final_neon(sum, sad_array);
}
-#endif
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
diff --git a/libvpx/vpx_dsp/arm/sad_neon.c b/libvpx/vpx_dsp/arm/sad_neon.c
index b1509d883..ad575d4aa 100644
--- a/libvpx/vpx_dsp/arm/sad_neon.c
+++ b/libvpx/vpx_dsp/arm/sad_neon.c
@@ -21,9 +21,15 @@ uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride) {
const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride);
const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
+#if defined(__ARM_FEATURE_DOTPROD)
+ const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8);
+ const uint32x4_t dp = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1));
+ return horizontal_add_uint32x4(dp);
+#else
uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8));
abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8));
return horizontal_add_uint16x8(abs);
+#endif
}
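
On targets with the Arm dot-product extension, the SAD kernels above and below take a different reduction path: the byte absolute differences are fed to vdot/vdotq against a vector of ones, so each 32-bit lane directly accumulates the sum of four byte differences and the 16-bit vabdl/vabal accumulators disappear. A scalar model of one vdotq_u32 step:

    #include <stdint.h>

    /* Scalar model of vdotq_u32(acc, vabdq_u8(a, b), ones): every 32-bit lane
     * accumulates the sum of four absolute byte differences. */
    static void sad_dot_ones_sketch(uint32_t acc[4], const uint8_t a[16],
                                    const uint8_t b[16]) {
      int lane, k;
      for (lane = 0; lane < 4; ++lane) {
        for (k = 0; k < 4; ++k) {
          const int d = (int)a[4 * lane + k] - (int)b[4 * lane + k];
          acc[lane] += (uint32_t)(d < 0 ? -d : d);
        }
      }
    }
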
uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride,
@@ -33,13 +39,34 @@ uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride,
const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
const uint8x16_t second_pred_u8 = vld1q_u8(second_pred);
const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8);
+#if defined(__ARM_FEATURE_DOTPROD)
+ const uint8x16_t sad_u8 = vabdq_u8(src_u8, avg);
+ const uint32x4_t prod = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1));
+ return horizontal_add_uint32x4(prod);
+#else
uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(avg));
abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg));
return horizontal_add_uint16x8(abs);
+#endif
}
uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride) {
+#if defined(__ARM_FEATURE_DOTPROD)
+ uint32x4_t prod = vdupq_n_u32(0);
+ const uint8x16_t ones = vdupq_n_u8(1);
+ const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride);
+ const uint8x16_t ref1_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
+ const uint8x16_t src2_u8 =
+ load_unaligned_u8q(src_ptr + 4 * src_stride, src_stride);
+ const uint8x16_t ref2_u8 =
+ load_unaligned_u8q(ref_ptr + 4 * ref_stride, ref_stride);
+ const uint8x16_t sad1_u8 = vabdq_u8(src1_u8, ref1_u8);
+ const uint8x16_t sad2_u8 = vabdq_u8(src2_u8, ref2_u8);
+ prod = vdotq_u32(prod, sad1_u8, ones);
+ prod = vdotq_u32(prod, sad2_u8, ones);
+ return horizontal_add_uint32x4(prod);
+#else
int i;
uint16x8_t abs = vdupq_n_u16(0);
for (i = 0; i < 8; i += 4) {
@@ -52,11 +79,31 @@ uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride,
}
return horizontal_add_uint16x8(abs);
+#endif
}
uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
const uint8_t *second_pred) {
+#if defined(__ARM_FEATURE_DOTPROD)
+ uint32x4_t prod = vdupq_n_u32(0);
+ const uint8x16_t ones = vdupq_n_u8(1);
+ const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride);
+ const uint8x16_t ref1_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
+ const uint8x16_t src2_u8 =
+ load_unaligned_u8q(src_ptr + 4 * src_stride, src_stride);
+ const uint8x16_t ref2_u8 =
+ load_unaligned_u8q(ref_ptr + 4 * ref_stride, ref_stride);
+ const uint8x16_t second_pred1_u8 = vld1q_u8(second_pred);
+ const uint8x16_t second_pred2_u8 = vld1q_u8(second_pred + 16);
+ const uint8x16_t avg1 = vrhaddq_u8(ref1_u8, second_pred1_u8);
+ const uint8x16_t avg2 = vrhaddq_u8(ref2_u8, second_pred2_u8);
+ const uint8x16_t sad1_u8 = vabdq_u8(src1_u8, avg1);
+ const uint8x16_t sad2_u8 = vabdq_u8(src2_u8, avg2);
+ prod = vdotq_u32(prod, sad1_u8, ones);
+ prod = vdotq_u32(prod, sad2_u8, ones);
+ return horizontal_add_uint32x4(prod);
+#else
int i;
uint16x8_t abs = vdupq_n_u16(0);
for (i = 0; i < 8; i += 4) {
@@ -72,8 +119,65 @@ uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride,
}
return horizontal_add_uint16x8(abs);
+#endif
}
+#if defined(__ARM_FEATURE_DOTPROD)
+static INLINE uint32x2_t sad8x(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const int height) {
+ int i;
+ uint32x2_t prod = vdup_n_u32(0);
+ const uint8x8_t ones = vdup_n_u8(1);
+ for (i = 0; i < height; ++i) {
+ const uint8x8_t a_u8 = vld1_u8(src_ptr);
+ const uint8x8_t b_u8 = vld1_u8(ref_ptr);
+ const uint8x8_t sad_u8 = vabd_u8(a_u8, b_u8);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ prod = vdot_u32(prod, sad_u8, ones);
+ }
+ return prod;
+}
+
+static INLINE uint32x2_t sad8x_avg(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred,
+ const int height) {
+ int i;
+ uint32x2_t prod = vdup_n_u32(0);
+ const uint8x8_t ones = vdup_n_u8(1);
+ for (i = 0; i < height; ++i) {
+ const uint8x8_t a_u8 = vld1_u8(src_ptr);
+ const uint8x8_t b_u8 = vld1_u8(ref_ptr);
+ const uint8x8_t c_u8 = vld1_u8(second_pred);
+ const uint8x8_t avg = vrhadd_u8(b_u8, c_u8);
+ const uint8x8_t sad_u8 = vabd_u8(a_u8, avg);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 8;
+ prod = vdot_u32(prod, sad_u8, ones);
+ }
+ return prod;
+}
+
+#define SAD8XN(n) \
+ uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ const uint32x2_t prod = \
+ sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \
+ return horizontal_add_uint32x2(prod); \
+ } \
+ \
+ uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ const uint8_t *second_pred) { \
+ const uint32x2_t prod = \
+ sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
+ return horizontal_add_uint32x2(prod); \
+ }
+
+#else // !defined(__ARM_FEATURE_DOTPROD)
static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
const int height) {
@@ -124,11 +228,67 @@ static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride,
sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
return horizontal_add_uint16x8(abs); \
}
+#endif // defined(__ARM_FEATURE_DOTPROD)
SAD8XN(4)
SAD8XN(8)
SAD8XN(16)
+#if defined(__ARM_FEATURE_DOTPROD)
+static INLINE uint32x4_t sad16x(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const int height) {
+ int i;
+ uint32x4_t prod = vdupq_n_u32(0);
+ const uint8x16_t ones = vdupq_n_u8(1);
+ for (i = 0; i < height; ++i) {
+ const uint8x16_t src_u8 = vld1q_u8(src_ptr);
+ const uint8x16_t ref_u8 = vld1q_u8(ref_ptr);
+ const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ prod = vdotq_u32(prod, sad_u8, ones);
+ }
+ return prod;
+}
+
+static INLINE uint32x4_t sad16x_avg(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred,
+ const int height) {
+ int i;
+ uint32x4_t prod = vdupq_n_u32(0);
+ const uint8x16_t ones = vdupq_n_u8(1);
+ for (i = 0; i < height; ++i) {
+ const uint8x16_t a_u8 = vld1q_u8(src_ptr);
+ const uint8x16_t b_u8 = vld1q_u8(ref_ptr);
+ const uint8x16_t c_u8 = vld1q_u8(second_pred);
+ const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8);
+ const uint8x16_t sad_u8 = vabdq_u8(a_u8, avg);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+ prod = vdotq_u32(prod, sad_u8, ones);
+ }
+ return prod;
+}
+
+#define SAD16XN(n) \
+ uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ const uint32x4_t prod = \
+ sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n); \
+ return horizontal_add_uint32x4(prod); \
+ } \
+ \
+ uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ const uint8_t *second_pred) { \
+ const uint32x4_t prod = \
+ sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
+ return horizontal_add_uint32x4(prod); \
+ }
+#else // !defined(__ARM_FEATURE_DOTPROD)
static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
const int height) {
@@ -182,11 +342,78 @@ static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride,
sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
return horizontal_add_uint16x8(abs); \
}
+#endif // defined(__ARM_FEATURE_DOTPROD)
SAD16XN(8)
SAD16XN(16)
SAD16XN(32)
+#if defined(__ARM_FEATURE_DOTPROD)
+static INLINE uint32x4_t sad32x(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const int height) {
+ int i;
+ uint32x4_t prod = vdupq_n_u32(0);
+ const uint8x16_t ones = vdupq_n_u8(1);
+ for (i = 0; i < height; ++i) {
+ const uint8x16_t a_lo = vld1q_u8(src_ptr);
+ const uint8x16_t a_hi = vld1q_u8(src_ptr + 16);
+ const uint8x16_t b_lo = vld1q_u8(ref_ptr);
+ const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16);
+ const uint8x16_t sad_lo_u8 = vabdq_u8(a_lo, b_lo);
+ const uint8x16_t sad_hi_u8 = vabdq_u8(a_hi, b_hi);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ prod = vdotq_u32(prod, sad_lo_u8, ones);
+ prod = vdotq_u32(prod, sad_hi_u8, ones);
+ }
+ return prod;
+}
+
+static INLINE uint32x4_t sad32x_avg(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred,
+ const int height) {
+ int i;
+ uint32x4_t prod = vdupq_n_u32(0);
+ const uint8x16_t ones = vdupq_n_u8(1);
+ for (i = 0; i < height; ++i) {
+ const uint8x16_t a_lo = vld1q_u8(src_ptr);
+ const uint8x16_t a_hi = vld1q_u8(src_ptr + 16);
+ const uint8x16_t b_lo = vld1q_u8(ref_ptr);
+ const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16);
+ const uint8x16_t c_lo = vld1q_u8(second_pred);
+ const uint8x16_t c_hi = vld1q_u8(second_pred + 16);
+ const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo);
+ const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi);
+ const uint8x16_t sad_lo_u8 = vabdq_u8(a_lo, avg_lo);
+ const uint8x16_t sad_hi_u8 = vabdq_u8(a_hi, avg_hi);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 32;
+ prod = vdotq_u32(prod, sad_lo_u8, ones);
+ prod = vdotq_u32(prod, sad_hi_u8, ones);
+ }
+ return prod;
+}
+
+#define SAD32XN(n) \
+ uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ const uint32x4_t prod = \
+ sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n); \
+ return horizontal_add_uint32x4(prod); \
+ } \
+ \
+ uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ const uint8_t *second_pred) { \
+ const uint32x4_t prod = \
+ sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
+ return horizontal_add_uint32x4(prod); \
+ }
+
+#else  // !defined(__ARM_FEATURE_DOTPROD)
static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
const int height) {
@@ -250,11 +477,81 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride,
sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
return horizontal_add_uint16x8(abs); \
}
+#endif // defined(__ARM_FEATURE_DOTPROD)
SAD32XN(16)
SAD32XN(32)
SAD32XN(64)
+#if defined(__ARM_FEATURE_DOTPROD)
+static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const int height) {
+ int i;
+ uint32x4_t prod = vdupq_n_u32(0);
+ const uint8x16_t ones = vdupq_n_u8(1);
+ for (i = 0; i < height; ++i) {
+ const uint8x16_t a_0 = vld1q_u8(src_ptr);
+ const uint8x16_t a_1 = vld1q_u8(src_ptr + 16);
+ const uint8x16_t a_2 = vld1q_u8(src_ptr + 32);
+ const uint8x16_t a_3 = vld1q_u8(src_ptr + 48);
+ const uint8x16_t b_0 = vld1q_u8(ref_ptr);
+ const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16);
+ const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32);
+ const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48);
+ const uint8x16_t sad_0_u8 = vabdq_u8(a_0, b_0);
+ const uint8x16_t sad_1_u8 = vabdq_u8(a_1, b_1);
+ const uint8x16_t sad_2_u8 = vabdq_u8(a_2, b_2);
+ const uint8x16_t sad_3_u8 = vabdq_u8(a_3, b_3);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ prod = vdotq_u32(prod, sad_0_u8, ones);
+ prod = vdotq_u32(prod, sad_1_u8, ones);
+ prod = vdotq_u32(prod, sad_2_u8, ones);
+ prod = vdotq_u32(prod, sad_3_u8, ones);
+ }
+ return prod;
+}
+
+static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred,
+ const int height) {
+ int i;
+ uint32x4_t prod = vdupq_n_u32(0);
+ const uint8x16_t ones = vdupq_n_u8(1);
+ for (i = 0; i < height; ++i) {
+ const uint8x16_t a_0 = vld1q_u8(src_ptr);
+ const uint8x16_t a_1 = vld1q_u8(src_ptr + 16);
+ const uint8x16_t a_2 = vld1q_u8(src_ptr + 32);
+ const uint8x16_t a_3 = vld1q_u8(src_ptr + 48);
+ const uint8x16_t b_0 = vld1q_u8(ref_ptr);
+ const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16);
+ const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32);
+ const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48);
+ const uint8x16_t c_0 = vld1q_u8(second_pred);
+ const uint8x16_t c_1 = vld1q_u8(second_pred + 16);
+ const uint8x16_t c_2 = vld1q_u8(second_pred + 32);
+ const uint8x16_t c_3 = vld1q_u8(second_pred + 48);
+ const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0);
+ const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1);
+ const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2);
+ const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3);
+ const uint8x16_t sad_0_u8 = vabdq_u8(a_0, avg_0);
+ const uint8x16_t sad_1_u8 = vabdq_u8(a_1, avg_1);
+ const uint8x16_t sad_2_u8 = vabdq_u8(a_2, avg_2);
+ const uint8x16_t sad_3_u8 = vabdq_u8(a_3, avg_3);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 64;
+ prod = vdotq_u32(prod, sad_0_u8, ones);
+ prod = vdotq_u32(prod, sad_1_u8, ones);
+ prod = vdotq_u32(prod, sad_2_u8, ones);
+ prod = vdotq_u32(prod, sad_3_u8, ones);
+ }
+ return prod;
+}
+#else // !defined(__ARM_FEATURE_DOTPROD)
static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
const int height) {
@@ -332,6 +629,7 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride,
return vpadalq_u16(sum, abs_1);
}
}
+#endif // defined(__ARM_FEATURE_DOTPROD)
#define SAD64XN(n) \
uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride, \
diff --git a/libvpx/vpx_dsp/arm/subpel_variance_neon.c b/libvpx/vpx_dsp/arm/subpel_variance_neon.c
index a3befdc34..9328c3ed8 100644
--- a/libvpx/vpx_dsp/arm/subpel_variance_neon.c
+++ b/libvpx/vpx_dsp/arm/subpel_variance_neon.c
@@ -17,168 +17,474 @@
#include "vpx_dsp/variance.h"
#include "vpx_dsp/arm/mem_neon.h"
-static const uint8_t bilinear_filters[8][2] = {
- { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
- { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
-};
-
// Process a block exactly 4 wide and a multiple of 2 high.
-static void var_filter_block2d_bil_w4(const uint8_t *src_ptr,
- uint8_t *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- const uint8_t *filter) {
- const uint8x8_t f0 = vdup_n_u8(filter[0]);
- const uint8x8_t f1 = vdup_n_u8(filter[1]);
- unsigned int i;
- for (i = 0; i < output_height; i += 2) {
- const uint8x8_t src_0 = load_unaligned_u8(src_ptr, src_pixels_per_line);
- const uint8x8_t src_1 =
- load_unaligned_u8(src_ptr + pixel_step, src_pixels_per_line);
- const uint16x8_t a = vmull_u8(src_0, f0);
- const uint16x8_t b = vmlal_u8(a, src_1, f1);
- const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
- vst1_u8(output_ptr, out);
- src_ptr += 2 * src_pixels_per_line;
- output_ptr += 8;
- }
+static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+ uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+ vst1_u8(dst_ptr, blend_u8);
+
+ src_ptr += 2 * src_stride;
+ dst_ptr += 2 * 4;
+ i -= 2;
+ } while (i != 0);
}
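
The rewritten filter helpers above express the sub-pel offset directly in 1/8-pel units: the two taps are 8 - offset and offset, and vrshrn_n_u16(..., 3) is a rounding divide by 8, replacing the old 128-based bilinear_filters table (whose taps are multiples of 16, so the arithmetic is equivalent). Per output pixel this is:

    #include <stdint.h>

    /* out = (s0 * (8 - k) + s1 * k + 4) >> 3, with k the 1/8-pel offset. */
    static uint8_t bilinear_tap_sketch(uint8_t s0, uint8_t s1, int k) {
      return (uint8_t)((s0 * (8 - k) + s1 * k + 4) >> 3);
    }

With k == 0 this passes s0 through unchanged, and with k == 4 it reduces to the rounded average (s0 + s1 + 1) >> 1, which is what the specialized variants further down exploit.
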
// Process a block exactly 8 wide and any height.
-static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
- uint8_t *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- const uint8_t *filter) {
- const uint8x8_t f0 = vdup_n_u8(filter[0]);
- const uint8x8_t f1 = vdup_n_u8(filter[1]);
- unsigned int i;
- for (i = 0; i < output_height; ++i) {
- const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
- const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
- const uint16x8_t a = vmull_u8(src_0, f0);
- const uint16x8_t b = vmlal_u8(a, src_1, f1);
- const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
- vst1_u8(output_ptr, out);
- src_ptr += src_pixels_per_line;
- output_ptr += 8;
- }
+static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = vld1_u8(src_ptr);
+ uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
+ uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+ vst1_u8(dst_ptr, blend_u8);
+
+ src_ptr += src_stride;
+ dst_ptr += 8;
+ } while (--i != 0);
}
// Process a block which is a multiple of 16 wide and any height.
-static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
- uint8_t *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const uint8_t *filter) {
- const uint8x8_t f0 = vdup_n_u8(filter[0]);
- const uint8x8_t f1 = vdup_n_u8(filter[1]);
- unsigned int i, j;
- for (i = 0; i < output_height; ++i) {
- for (j = 0; j < output_width; j += 16) {
- const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
- const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
- const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
- const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
- const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
- const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
- const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
- const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
- vst1q_u8(output_ptr + j, vcombine_u8(out_lo, out_hi));
- }
- src_ptr += src_pixels_per_line;
- output_ptr += output_width;
- }
+static void var_filter_block2d_bil_large(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_width,
+ int dst_height, int filter_offset) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint16x8_t blend_l =
+ vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1);
+ uint16x8_t blend_h =
+ vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1);
+ uint8x8_t out_lo = vrshrn_n_u16(blend_l, 3);
+ uint8x8_t out_hi = vrshrn_n_u16(blend_h, 3);
+ vst1q_u8(dst_ptr + j, vcombine_u8(out_lo, out_hi));
+
+ j += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16,
+ dst_height, filter_offset);
+}
+static void var_filter_block2d_bil_w32(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32,
+ dst_height, filter_offset);
+}
+static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64,
+ dst_height, filter_offset);
+}
+
+static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_width, int dst_height) {
+ int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint8x16_t avg = vrhaddq_u8(s0, s1);
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
}
-// 4xM filter writes an extra row to fdata because it processes two rows at a
-// time.
-#define SUB_PIXEL_VARIANCENXM(n, m) \
- uint32_t vpx_sub_pixel_variance##n##x##m##_neon( \
- const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
- const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
- uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \
- uint8_t temp1[n * m]; \
- \
- if (n == 4) { \
- var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2), \
- bilinear_filters[x_offset]); \
- var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \
- bilinear_filters[y_offset]); \
- } else if (n == 8) { \
- var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1), \
- bilinear_filters[x_offset]); \
- var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \
- bilinear_filters[y_offset]); \
- } else { \
- var_filter_block2d_bil_w16(src_ptr, temp0, src_stride, 1, (m + 1), n, \
- bilinear_filters[x_offset]); \
- var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \
- bilinear_filters[y_offset]); \
- } \
- return vpx_variance##n##x##m(temp1, n, ref_ptr, ref_stride, sse); \
+#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
}
-SUB_PIXEL_VARIANCENXM(4, 4)
-SUB_PIXEL_VARIANCENXM(4, 8)
-SUB_PIXEL_VARIANCENXM(8, 4)
-SUB_PIXEL_VARIANCENXM(8, 8)
-SUB_PIXEL_VARIANCENXM(8, 16)
-SUB_PIXEL_VARIANCENXM(16, 8)
-SUB_PIXEL_VARIANCENXM(16, 16)
-SUB_PIXEL_VARIANCENXM(16, 32)
-SUB_PIXEL_VARIANCENXM(32, 16)
-SUB_PIXEL_VARIANCENXM(32, 32)
-SUB_PIXEL_VARIANCENXM(32, 64)
-SUB_PIXEL_VARIANCENXM(64, 32)
-SUB_PIXEL_VARIANCENXM(64, 64)
-
-// 4xM filter writes an extra row to fdata because it processes two rows at a
-// time.
-#define SUB_PIXEL_AVG_VARIANCENXM(n, m) \
- uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon( \
- const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
- const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
+#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse) { \
+ if (xoffset == 0) { \
+ if (yoffset == 0) { \
+ return vpx_variance##w##x##h##_neon(src, src_stride, ref, ref_stride, \
+ sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp[w * h]; \
+ var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \
+ return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \
+ yoffset); \
+ return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \
+ return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \
+ return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
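
The specialized macro above peels off the degenerate offsets before falling back to the general filter: offset 0 needs no filtering (the variance kernel can read the source directly), and offset 4 weights both taps equally, so the 2-tap blend collapses to a rounded pairwise average, which is why those branches call var_filter_block2d_avg (vrhaddq_u8) instead. A one-pixel sketch of the dispatch; the name is illustrative.

    #include <stdint.h>

    /* One-pixel model of the three cases handled per offset above:
     * k == 0: pass-through; k == 4: rounded average; otherwise: 2-tap blend. */
    static uint8_t subpel_dispatch_sketch(uint8_t s0, uint8_t s1, int k) {
      if (k == 0) return s0;
      if (k == 4) return (uint8_t)((s0 + s1 + 1) >> 1);
      return (uint8_t)((s0 * (8 - k) + s1 * k + 4) >> 3);
    }

Since (s0 * 4 + s1 * 4 + 4) >> 3 equals (s0 + s1 + 1) >> 1, the average branch changes only the cost, not the result.
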
+
+// 4x<h> blocks are processed two rows at a time, so require an extra row of
+// padding.
+SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
+SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
+
+SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
+SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
+SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 4.
+static void avg_pred_var_filter_block2d_bil_w4(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset,
+ const uint8_t *second_pred) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+ uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+
+ uint8x8_t p = vld1_u8(second_pred);
+ uint8x8_t avg = vrhadd_u8(blend_u8, p);
+
+ vst1_u8(dst_ptr, avg);
+
+ src_ptr += 2 * src_stride;
+ dst_ptr += 2 * 4;
+ second_pred += 2 * 4;
+ i -= 2;
+ } while (i != 0);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 8.
+static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset,
+ const uint8_t *second_pred) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = vld1_u8(src_ptr);
+ uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
+ uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+
+ uint8x8_t p = vld1_u8(second_pred);
+ uint8x8_t avg = vrhadd_u8(blend_u8, p);
+
+ vst1_u8(dst_ptr, avg);
+
+ src_ptr += src_stride;
+ dst_ptr += 8;
+ second_pred += 8;
+ } while (--i > 0);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for large blocks.
+static void avg_pred_var_filter_block2d_bil_large(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, int filter_offset,
+ const uint8_t *second_pred) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint16x8_t blend_l =
+ vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1);
+ uint16x8_t blend_h =
+ vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1);
+ uint8x16_t blend_u8 =
+ vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
+
+ uint8x16_t p = vld1q_u8(second_pred);
+ uint8x16_t avg = vrhaddq_u8(blend_u8, p);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 16.
+static void avg_pred_var_filter_block2d_bil_w16(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 16, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 32.
+static void avg_pred_var_filter_block2d_bil_w32(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 32, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 64.
+static void avg_pred_var_filter_block2d_bil_w64(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 64, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine averaging subpel filter with vpx_comp_avg_pred.
+static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_width,
+ int dst_height,
+ const uint8_t *second_pred) {
+ int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint8x16_t avg = vrhaddq_u8(s0, s1);
+
+ uint8x16_t p = vld1q_u8(second_pred);
+ avg = vrhaddq_u8(avg, p);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+// Implementation of vpx_comp_avg_pred for blocks having width >= 16.
+static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride,
+ int dst_width, int dst_height,
+ const uint8_t *second_pred) {
+ int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s = vld1q_u8(src_ptr + j);
+ uint8x16_t p = vld1q_u8(second_pred);
+
+ uint8x16_t avg = vrhaddq_u8(s, p);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
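
avg_pred and the avg_pred_var_filter_* helpers fuse the compound average into the final filter (or copy) pass, so the filtered block is averaged with second_pred as it is produced instead of taking a separate vpx_comp_avg_pred pass over an intermediate buffer. Per pixel the fused path is just the following (hedged sketch, names illustrative):

    #include <stdint.h>

    /* One-pixel model of the fused bilinear filter + compound average. */
    static uint8_t fused_avg_pred_sketch(uint8_t s0, uint8_t s1, int k,
                                         uint8_t pred) {
      const uint8_t filtered = (uint8_t)((s0 * (8 - k) + s1 * k + 4) >> 3);
      return (uint8_t)((filtered + pred + 1) >> 1);  /* vrhadd_u8 */
    }
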
+
+#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse, \
const uint8_t *second_pred) { \
- uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \
- uint8_t temp1[n * m]; \
- \
- if (n == 4) { \
- var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2), \
- bilinear_filters[x_offset]); \
- var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \
- bilinear_filters[y_offset]); \
- } else if (n == 8) { \
- var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1), \
- bilinear_filters[x_offset]); \
- var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \
- bilinear_filters[y_offset]); \
- } else { \
- var_filter_block2d_bil_w16(src_ptr, temp0, src_stride, 1, (m + 1), n, \
- bilinear_filters[x_offset]); \
- var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \
- bilinear_filters[y_offset]); \
- } \
- \
- vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n); \
- \
- return vpx_variance##n##x##m(temp0, n, ref_ptr, ref_stride, sse); \
+ uint8_t tmp0[w * (h + padding)]; \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \
+ xoffset); \
+ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \
+ second_pred); \
+ return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ }
+
+#define SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse, \
+ const uint8_t *second_pred) { \
+ if (xoffset == 0) { \
+ uint8_t tmp[w * h]; \
+ if (yoffset == 0) { \
+ avg_pred(src, tmp, source_stride, w, h, second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ avg_pred_var_filter_block2d_avg(src, tmp, source_stride, \
+ source_stride, w, h, second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ } else { \
+ avg_pred_var_filter_block2d_bil_w##w( \
+ src, tmp, source_stride, source_stride, h, yoffset, second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h, \
+ second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+ avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \
+ second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h, \
+ xoffset, second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \
+ (h + padding), xoffset); \
+ avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \
+ (h + padding), xoffset); \
+ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \
+ second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } \
}
-SUB_PIXEL_AVG_VARIANCENXM(4, 4)
-SUB_PIXEL_AVG_VARIANCENXM(4, 8)
-SUB_PIXEL_AVG_VARIANCENXM(8, 4)
-SUB_PIXEL_AVG_VARIANCENXM(8, 8)
-SUB_PIXEL_AVG_VARIANCENXM(8, 16)
-SUB_PIXEL_AVG_VARIANCENXM(16, 8)
-SUB_PIXEL_AVG_VARIANCENXM(16, 16)
-SUB_PIXEL_AVG_VARIANCENXM(16, 32)
-SUB_PIXEL_AVG_VARIANCENXM(32, 16)
-SUB_PIXEL_AVG_VARIANCENXM(32, 32)
-SUB_PIXEL_AVG_VARIANCENXM(32, 64)
-SUB_PIXEL_AVG_VARIANCENXM(64, 32)
-SUB_PIXEL_AVG_VARIANCENXM(64, 64)
+// 4x<h> blocks are processed two rows at a time, so require an extra row of
+// padding.
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2)
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2)
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1)
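The two macros above exploit the structure of the eight-phase bilinear filter table: offset 0 selects taps {128, 0} (a plain copy) and offset 4 selects {64, 64} (a rounding average of two neighbouring pixels), so only the remaining offsets need the general two-tap filter. A scalar sketch of that general tap, assuming the usual libvpx convention of FILTER_BITS == 7 with taps summing to 128:

#include <stdint.h>

/* Two-tap bilinear filter: out = ROUND_POWER_OF_TWO(s0 * f0 + s1 * f1, 7),
 * with f0 + f1 == 128.  Offset 0 degenerates to a copy and offset 4 to a
 * rounding average, which is why those cases get dedicated kernels. */
static uint8_t bilinear_tap_sketch(const uint8_t *src, int pixel_step,
                                   int f0, int f1) {
  const int sum = src[0] * f0 + src[pixel_step] * f1;
  return (uint8_t)((sum + 64) >> 7);
}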
diff --git a/libvpx/vpx_dsp/arm/subtract_neon.c b/libvpx/vpx_dsp/arm/subtract_neon.c
index 612897e24..2c008e48a 100644
--- a/libvpx/vpx_dsp/arm/subtract_neon.c
+++ b/libvpx/vpx_dsp/arm/subtract_neon.c
@@ -79,3 +79,59 @@ void vpx_subtract_block_neon(int rows, int cols, int16_t *diff,
} while (r);
}
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_subtract_block_neon(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride,
+ const uint8_t *src8_ptr,
+ ptrdiff_t src_stride,
+ const uint8_t *pred8_ptr,
+ ptrdiff_t pred_stride, int bd) {
+ int r = rows, c;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8_ptr);
+ (void)bd;
+
+ if (cols >= 16) {
+ do {
+ for (c = 0; c < cols; c += 16) {
+ const uint16x8_t s0 = vld1q_u16(&src[c + 0]);
+ const uint16x8_t s1 = vld1q_u16(&src[c + 8]);
+ const uint16x8_t p0 = vld1q_u16(&pred[c + 0]);
+ const uint16x8_t p1 = vld1q_u16(&pred[c + 8]);
+ const uint16x8_t d0 = vsubq_u16(s0, p0);
+ const uint16x8_t d1 = vsubq_u16(s1, p1);
+ vst1q_s16(&diff_ptr[c + 0], vreinterpretq_s16_u16(d0));
+ vst1q_s16(&diff_ptr[c + 8], vreinterpretq_s16_u16(d1));
+ }
+ diff_ptr += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r);
+ } else if (cols >= 8) {
+ do {
+ for (c = 0; c < cols; c += 8) {
+ const uint16x8_t s = vld1q_u16(&src[c]);
+ const uint16x8_t p = vld1q_u16(&pred[c]);
+ const uint16x8_t d0 = vsubq_u16(s, p);
+ vst1q_s16(&diff_ptr[c], vreinterpretq_s16_u16(d0));
+ }
+ diff_ptr += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r);
+ } else if (cols >= 4) {
+ do {
+ for (c = 0; c < cols; c += 4) {
+ const uint16x4_t s = vld1_u16(&src[c]);
+ const uint16x4_t p = vld1_u16(&pred[c]);
+ const uint16x4_t v_diff = vsub_u16(s, p);
+ vst1_s16(&diff_ptr[c], vreinterpret_s16_u16(v_diff));
+ }
+ diff_ptr += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r);
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
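The high-bitdepth kernel works on uint16_t samples behind the CONVERT_TO_SHORTPTR cast; with bd at most 12 the per-sample difference always fits in int16_t, which is why bd is unused. A scalar sketch of the operation (illustrative):

#include <stddef.h>
#include <stdint.h>

/* Scalar model of the high-bitdepth subtract: diff = src - pred per sample. */
static void highbd_subtract_sketch(int rows, int cols, int16_t *diff,
                                   ptrdiff_t diff_stride, const uint16_t *src,
                                   ptrdiff_t src_stride, const uint16_t *pred,
                                   ptrdiff_t pred_stride) {
  int r, c;
  for (r = 0; r < rows; ++r) {
    for (c = 0; c < cols; ++c) diff[c] = (int16_t)(src[c] - pred[c]);
    diff += diff_stride;
    src += src_stride;
    pred += pred_stride;
  }
}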
diff --git a/libvpx/vpx_dsp/arm/transpose_neon.h b/libvpx/vpx_dsp/arm/transpose_neon.h
index c098ad31b..41d44f2b1 100644
--- a/libvpx/vpx_dsp/arm/transpose_neon.h
+++ b/libvpx/vpx_dsp/arm/transpose_neon.h
@@ -568,6 +568,40 @@ static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
*a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}
+// Transpose 8x8 to a new location.
+static INLINE void transpose_s16_8x8_new(const int16x8_t *a, int16x8_t *b) {
+ // Swap 16 bit elements.
+ const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
+ const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
+ const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
+ const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
+
+ // Swap 32 bit elements.
+ const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
+ vreinterpretq_s32_s16(c1.val[0]));
+ const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
+ vreinterpretq_s32_s16(c1.val[1]));
+ const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
+ vreinterpretq_s32_s16(c3.val[0]));
+ const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
+ vreinterpretq_s32_s16(c3.val[1]));
+
+ // Swap 64 bit elements
+ const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
+ const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
+ const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
+ const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
+
+ b[0] = e0.val[0];
+ b[1] = e1.val[0];
+ b[2] = e2.val[0];
+ b[3] = e3.val[0];
+ b[4] = e0.val[1];
+ b[5] = e1.val[1];
+ b[6] = e2.val[1];
+ b[7] = e3.val[1];
+}
+
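The three vtrn stages above (16-, 32- and 64-bit swaps) are a register-level implementation of an ordinary 8x8 transpose; the scalar equivalent, for reference:

#include <stdint.h>

/* Scalar equivalent of transpose_s16_8x8_new(): b[i][j] = a[j][i]. */
static void transpose_8x8_sketch(const int16_t a[8][8], int16_t b[8][8]) {
  int i, j;
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j) b[i][j] = a[j][i];
  }
}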
static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
int16x8_t *a2, int16x8_t *a3,
int16x8_t *a4, int16x8_t *a5,
@@ -787,6 +821,51 @@ static INLINE void transpose_s32_8x8(int32x4x2_t *a0, int32x4x2_t *a1,
a7->val[1] = c7.val[1];
}
+// Helper transpose function for highbd FDCT variants
+static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/,
+ int32x4_t *right /*[8]*/,
+ int32x4_t *out_left /*[8]*/,
+ int32x4_t *out_right /*[8]*/) {
+ int32x4x2_t out[8];
+
+ out[0].val[0] = left[0];
+ out[0].val[1] = right[0];
+ out[1].val[0] = left[1];
+ out[1].val[1] = right[1];
+ out[2].val[0] = left[2];
+ out[2].val[1] = right[2];
+ out[3].val[0] = left[3];
+ out[3].val[1] = right[3];
+ out[4].val[0] = left[4];
+ out[4].val[1] = right[4];
+ out[5].val[0] = left[5];
+ out[5].val[1] = right[5];
+ out[6].val[0] = left[6];
+ out[6].val[1] = right[6];
+ out[7].val[0] = left[7];
+ out[7].val[1] = right[7];
+
+ transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+
+ out_left[0] = out[0].val[0];
+ out_left[1] = out[1].val[0];
+ out_left[2] = out[2].val[0];
+ out_left[3] = out[3].val[0];
+ out_left[4] = out[4].val[0];
+ out_left[5] = out[5].val[0];
+ out_left[6] = out[6].val[0];
+ out_left[7] = out[7].val[0];
+ out_right[0] = out[0].val[1];
+ out_right[1] = out[1].val[1];
+ out_right[2] = out[2].val[1];
+ out_right[3] = out[3].val[1];
+ out_right[4] = out[4].val[1];
+ out_right[5] = out[5].val[1];
+ out_right[6] = out[6].val[1];
+ out_right[7] = out[7].val[1];
+}
+
static INLINE void transpose_u8_16x8(
const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
diff --git a/libvpx/vpx_dsp/arm/variance_neon.c b/libvpx/vpx_dsp/arm/variance_neon.c
index 7b93f142b..3ccc4e807 100644
--- a/libvpx/vpx_dsp/arm/variance_neon.c
+++ b/libvpx/vpx_dsp/arm/variance_neon.c
@@ -19,345 +19,357 @@
#include "vpx_dsp/arm/sum_neon.h"
#include "vpx_ports/mem.h"
-#if defined(__ARM_FEATURE_DOTPROD) && (__ARM_FEATURE_DOTPROD == 1)
+#if defined(__ARM_FEATURE_DOTPROD)
// Process a block of width 4 four rows at a time.
-static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride, int h,
- uint32_t *sse, int *sum) {
- int i;
- uint32x4_t sum_a = vdupq_n_u32(0);
- uint32x4_t sum_b = vdupq_n_u32(0);
+static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
uint32x4_t sse_u32 = vdupq_n_u32(0);
- for (i = 0; i < h; i += 4) {
- const uint8x16_t a = load_unaligned_u8q(src_ptr, src_stride);
- const uint8x16_t b = load_unaligned_u8q(ref_ptr, ref_stride);
+ int i = h;
+ do {
+ const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride);
+ const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride);
- const uint8x16_t abs_diff = vabdq_u8(a, b);
+ const uint8x16_t abs_diff = vabdq_u8(s, r);
sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
- sum_a = vdotq_u32(sum_a, a, vdupq_n_u8(1));
- sum_b = vdotq_u32(sum_b, b, vdupq_n_u8(1));
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
src_ptr += 4 * src_stride;
ref_ptr += 4 * ref_stride;
- }
+ i -= 4;
+ } while (i != 0);
- *sum = horizontal_add_int32x4(vreinterpretq_s32_u32(vsubq_u32(sum_a, sum_b)));
+ *sum = horizontal_add_int32x4(
+ vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
*sse = horizontal_add_uint32x4(sse_u32);
}
-// Process a block of any size where the width is divisible by 16.
-static void variance_neon_w16(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride, int w,
- int h, uint32_t *sse, int *sum) {
- int i, j;
- uint32x4_t sum_a = vdupq_n_u32(0);
- uint32x4_t sum_b = vdupq_n_u32(0);
+// Process a block of width 8 two rows at a time.
+static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
uint32x4_t sse_u32 = vdupq_n_u32(0);
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 16) {
- const uint8x16_t a = vld1q_u8(src_ptr + j);
- const uint8x16_t b = vld1q_u8(ref_ptr + j);
+ int i = h;
+ do {
+ const uint8x16_t s =
+ vcombine_u8(vld1_u8(src_ptr), vld1_u8(src_ptr + src_stride));
+ const uint8x16_t r =
+ vcombine_u8(vld1_u8(ref_ptr), vld1_u8(ref_ptr + ref_stride));
- const uint8x16_t abs_diff = vabdq_u8(a, b);
- sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+ const uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ *sum = horizontal_add_int32x4(
+ vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
+ *sse = horizontal_add_uint32x4(sse_u32);
+}
+
+// Process a block of width 16 one row at a time.
+static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ const uint8x16_t s = vld1q_u8(src_ptr);
+ const uint8x16_t r = vld1q_u8(ref_ptr);
+
+ const uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
- sum_a = vdotq_u32(sum_a, a, vdupq_n_u8(1));
- sum_b = vdotq_u32(sum_b, b, vdupq_n_u8(1));
- }
src_ptr += src_stride;
ref_ptr += ref_stride;
- }
+ } while (--i != 0);
- *sum = horizontal_add_int32x4(vreinterpretq_s32_u32(vsubq_u32(sum_a, sum_b)));
+ *sum = horizontal_add_int32x4(
+ vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
*sse = horizontal_add_uint32x4(sse_u32);
}
-// Process a block of width 8 two rows at a time.
-static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride, int h,
- uint32_t *sse, int *sum) {
- int i = 0;
- uint32x2_t sum_a = vdup_n_u32(0);
- uint32x2_t sum_b = vdup_n_u32(0);
- uint32x2_t sse_lo_u32 = vdup_n_u32(0);
- uint32x2_t sse_hi_u32 = vdup_n_u32(0);
+// Process a block of any size where the width is divisible by 16.
+static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int w, int h, uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+ int i = h;
do {
- const uint8x8_t a_0 = vld1_u8(src_ptr);
- const uint8x8_t a_1 = vld1_u8(src_ptr + src_stride);
- const uint8x8_t b_0 = vld1_u8(ref_ptr);
- const uint8x8_t b_1 = vld1_u8(ref_ptr + ref_stride);
-
- const uint8x8_t abs_diff_0 = vabd_u8(a_0, b_0);
- const uint8x8_t abs_diff_1 = vabd_u8(a_1, b_1);
- sse_lo_u32 = vdot_u32(sse_lo_u32, abs_diff_0, abs_diff_0);
- sse_hi_u32 = vdot_u32(sse_hi_u32, abs_diff_1, abs_diff_1);
-
- sum_a = vdot_u32(sum_a, a_0, vdup_n_u8(1));
- sum_b = vdot_u32(sum_b, b_0, vdup_n_u8(1));
- sum_a = vdot_u32(sum_a, a_1, vdup_n_u8(1));
- sum_b = vdot_u32(sum_b, b_1, vdup_n_u8(1));
-
- src_ptr += src_stride + src_stride;
- ref_ptr += ref_stride + ref_stride;
- i += 2;
- } while (i < h);
+ int j = 0;
+ do {
+ const uint8x16_t s = vld1q_u8(src_ptr + j);
+ const uint8x16_t r = vld1q_u8(ref_ptr + j);
+
+ const uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+ j += 16;
+ } while (j < w);
- *sum = horizontal_add_int32x2(vreinterpret_s32_u32(vsub_u32(sum_a, sum_b)));
- *sse = horizontal_add_uint32x2(vadd_u32(sse_lo_u32, sse_hi_u32));
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ *sum = horizontal_add_int32x4(
+ vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
+ *sse = horizontal_add_uint32x4(sse_u32);
}
-#else
+static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum);
+}
-// The variance helper functions use int16_t for sum. 8 values are accumulated
-// and then added (at which point they expand up to int32_t). To avoid overflow,
-// there can be no more than 32767 / 255 ~= 128 values accumulated in each
-// column. For a 32x32 buffer, this results in 32 / 8 = 4 values per row * 32
-// rows = 128. Asserts have been added to each function to warn against reaching
-// this limit.
+static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum);
+}
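The dot-product helpers above use the same UDOT idiom twice: dotting the absolute difference with itself accumulates the sum of squared errors, and dotting a row of samples with a vector of ones accumulates its plain byte sum. A scalar model of one vdotq_u32 step over 16 bytes (illustrative):

#include <stdint.h>

/* Scalar model of vdotq_u32(acc, a, b): each 32-bit lane accumulates the dot
 * product of one group of four bytes.  With b == {1, 1, 1, ...} this sums the
 * bytes of 'a'; with a == b == |src - ref| it accumulates squared errors. */
static void udot_u32_sketch(uint32_t acc[4], const uint8_t a[16],
                            const uint8_t b[16]) {
  int lane, k;
  for (lane = 0; lane < 4; ++lane) {
    for (k = 0; k < 4; ++k) {
      acc[lane] += (uint32_t)a[4 * lane + k] * b[4 * lane + k];
    }
  }
}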
-// Process a block of width 4 four rows at a time.
-static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride, int h,
- uint32_t *sse, int *sum) {
- int i;
+#else // !defined(__ARM_FEATURE_DOTPROD)
+
+// Process a block of width 4 two rows at a time.
+static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
int16x8_t sum_s16 = vdupq_n_s16(0);
- int32x4_t sse_lo_s32 = vdupq_n_s32(0);
- int32x4_t sse_hi_s32 = vdupq_n_s32(0);
+ int32x4_t sse_s32 = vdupq_n_s32(0);
+ int i = h;
- // Since width is only 4, sum_s16 only loads a half row per loop.
+ // Number of rows we can process before 'sum_s16' overflows:
+ // 32767 / 255 ~= 128, but we use an 8-wide accumulator; so 256 4-wide rows.
assert(h <= 256);
- for (i = 0; i < h; i += 4) {
- const uint8x16_t a_u8 = load_unaligned_u8q(src_ptr, src_stride);
- const uint8x16_t b_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
- const uint16x8_t diff_lo_u16 =
- vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8));
- const uint16x8_t diff_hi_u16 =
- vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8));
-
- const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16);
- const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16);
-
- sum_s16 = vaddq_s16(sum_s16, diff_lo_s16);
- sum_s16 = vaddq_s16(sum_s16, diff_hi_s16);
+ do {
+ const uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+ const uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+ const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r));
- sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16),
- vget_low_s16(diff_lo_s16));
- sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16),
- vget_high_s16(diff_lo_s16));
+ sum_s16 = vaddq_s16(sum_s16, diff);
- sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16),
- vget_low_s16(diff_hi_s16));
- sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16),
- vget_high_s16(diff_hi_s16));
+ sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff));
+ sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff));
- src_ptr += 4 * src_stride;
- ref_ptr += 4 * ref_stride;
- }
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
*sum = horizontal_add_int16x8(sum_s16);
- *sse = horizontal_add_uint32x4(
- vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32)));
+ *sse = (uint32_t)horizontal_add_int32x4(sse_s32);
}
-// Process a block of any size where the width is divisible by 16.
-static void variance_neon_w16(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride, int w,
- int h, uint32_t *sse, int *sum) {
- int i, j;
+// Process a block of width 8 one row at a time.
+static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
int16x8_t sum_s16 = vdupq_n_s16(0);
- int32x4_t sse_lo_s32 = vdupq_n_s32(0);
- int32x4_t sse_hi_s32 = vdupq_n_s32(0);
-
- // The loop loads 16 values at a time but doubles them up when accumulating
- // into sum_s16.
- assert(w / 8 * h <= 128);
-
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 16) {
- const uint8x16_t a_u8 = vld1q_u8(src_ptr + j);
- const uint8x16_t b_u8 = vld1q_u8(ref_ptr + j);
-
- const uint16x8_t diff_lo_u16 =
- vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8));
- const uint16x8_t diff_hi_u16 =
- vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8));
-
- const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16);
- const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16);
-
- sum_s16 = vaddq_s16(sum_s16, diff_lo_s16);
- sum_s16 = vaddq_s16(sum_s16, diff_hi_s16);
-
- sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16),
- vget_low_s16(diff_lo_s16));
- sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16),
- vget_high_s16(diff_lo_s16));
-
- sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16),
- vget_low_s16(diff_hi_s16));
- sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16),
- vget_high_s16(diff_hi_s16));
- }
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int i = h;
+
+ // Number of rows we can process before 'sum_s16' overflows:
+ // 32767 / 255 ~= 128
+ assert(h <= 128);
+
+ do {
+ const uint8x8_t s = vld1_u8(src_ptr);
+ const uint8x8_t r = vld1_u8(ref_ptr);
+ const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r));
+
+ sum_s16 = vaddq_s16(sum_s16, diff);
+
+ sse_s32[0] = vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff));
+
src_ptr += src_stride;
ref_ptr += ref_stride;
- }
+ } while (--i != 0);
*sum = horizontal_add_int16x8(sum_s16);
- *sse = horizontal_add_uint32x4(
- vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32)));
+ *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
}
-// Process a block of width 8 two rows at a time.
-static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride, int h,
- uint32_t *sse, int *sum) {
- int i = 0;
- int16x8_t sum_s16 = vdupq_n_s16(0);
- int32x4_t sse_lo_s32 = vdupq_n_s32(0);
- int32x4_t sse_hi_s32 = vdupq_n_s32(0);
+// Process a block of width 16 one row at a time.
+static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) };
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int i = h;
- // Each column has it's own accumulator entry in sum_s16.
+ // Number of rows we can process before 'sum_s16' accumulators overflow:
+ // 32767 / 255 ~= 128, so 128 16-wide rows.
assert(h <= 128);
do {
- const uint8x8_t a_0_u8 = vld1_u8(src_ptr);
- const uint8x8_t a_1_u8 = vld1_u8(src_ptr + src_stride);
- const uint8x8_t b_0_u8 = vld1_u8(ref_ptr);
- const uint8x8_t b_1_u8 = vld1_u8(ref_ptr + ref_stride);
- const uint16x8_t diff_0_u16 = vsubl_u8(a_0_u8, b_0_u8);
- const uint16x8_t diff_1_u16 = vsubl_u8(a_1_u8, b_1_u8);
- const int16x8_t diff_0_s16 = vreinterpretq_s16_u16(diff_0_u16);
- const int16x8_t diff_1_s16 = vreinterpretq_s16_u16(diff_1_u16);
- sum_s16 = vaddq_s16(sum_s16, diff_0_s16);
- sum_s16 = vaddq_s16(sum_s16, diff_1_s16);
- sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_0_s16),
- vget_low_s16(diff_0_s16));
- sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_1_s16),
- vget_low_s16(diff_1_s16));
- sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_0_s16),
- vget_high_s16(diff_0_s16));
- sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_1_s16),
- vget_high_s16(diff_1_s16));
- src_ptr += src_stride + src_stride;
- ref_ptr += ref_stride + ref_stride;
- i += 2;
+ const uint8x16_t s = vld1q_u8(src_ptr);
+ const uint8x16_t r = vld1q_u8(ref_ptr);
+
+ const int16x8_t diff_l =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r)));
+ const int16x8_t diff_h =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r)));
+
+ sum_s16[0] = vaddq_s16(sum_s16[0], diff_l);
+ sum_s16[1] = vaddq_s16(sum_s16[1], diff_h);
+
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l));
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ *sum = horizontal_add_int16x8(vaddq_s16(sum_s16[0], sum_s16[1]));
+ *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
+}
+
+// Process a block of any size where the width is divisible by 16.
+static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int w, int h, int h_limit,
+ unsigned int *sse, int *sum) {
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ // 'h_limit' is the number of 'w'-width rows we can process before our 16-bit
+ // accumulator overflows. After hitting this limit we accumulate into 32-bit
+ // elements.
+ int h_tmp = h > h_limit ? h_limit : h;
+
+ int i = 0;
+ do {
+ int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) };
+ do {
+ int j = 0;
+ do {
+ const uint8x16_t s = vld1q_u8(src_ptr + j);
+ const uint8x16_t r = vld1q_u8(ref_ptr + j);
+
+ const int16x8_t diff_l =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r)));
+ const int16x8_t diff_h =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r)));
+
+ sum_s16[0] = vaddq_s16(sum_s16[0], diff_l);
+ sum_s16[1] = vaddq_s16(sum_s16[1], diff_h);
+
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l));
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h));
+
+ j += 16;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ i++;
+ } while (i < h_tmp);
+
+ sum_s32 = vpadalq_s16(sum_s32, sum_s16[0]);
+ sum_s32 = vpadalq_s16(sum_s32, sum_s16[1]);
+
+ h_tmp += h_limit;
} while (i < h);
- *sum = horizontal_add_int16x8(sum_s16);
- *sse = horizontal_add_uint32x4(
- vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32)));
+ *sum = horizontal_add_int32x4(sum_s32);
+ *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
}
-#endif
+static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 32, h, 64, sse, sum);
+}
+
+static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 64, h, 32, sse, sum);
+}
+
+#endif // defined(__ARM_FEATURE_DOTPROD)
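The h_limit values passed to variance_large_neon (64 for width 32, 32 for width 64) follow from the 16-bit accumulator bound: each int16 lane sums w / 16 differences per row, each of magnitude at most 255. A sketch of that bound, assuming this accumulator layout:

/* Rows one int16 lane can absorb before it may overflow:
 * (w / 16) * rows * 255 <= 32767.
 * w = 32 -> 64 and w = 64 -> 32, matching the h_limit arguments above;
 * w = 16 -> 128, consistent with the assert in variance_16xh_neon(). */
static int max_rows_before_overflow(int w) {
  return 32767 / (255 * (w / 16));
}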
void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
unsigned int *sse, int *sum) {
- variance_neon_w8x2(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, sum);
+ variance_8xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, sum);
}
void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
unsigned int *sse, int *sum) {
- variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16, sse, sum);
+ variance_16xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, sse, sum);
}
-#define VARIANCENXM(n, m, shift) \
- unsigned int vpx_variance##n##x##m##_neon( \
- const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
- int ref_stride, unsigned int *sse) { \
- int sum; \
- if (n == 4) \
- variance_neon_w4x4(src_ptr, src_stride, ref_ptr, ref_stride, m, sse, \
- &sum); \
- else if (n == 8) \
- variance_neon_w8x2(src_ptr, src_stride, ref_ptr, ref_stride, m, sse, \
- &sum); \
- else \
- variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, n, m, sse, \
- &sum); \
- if (n * m < 16 * 16) \
- return *sse - ((sum * sum) >> shift); \
- else \
- return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
+#define VARIANCE_WXH_NEON(w, h, shift) \
+ unsigned int vpx_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
}
-VARIANCENXM(4, 4, 4)
-VARIANCENXM(4, 8, 5)
-VARIANCENXM(8, 4, 5)
-VARIANCENXM(8, 8, 6)
-VARIANCENXM(8, 16, 7)
-VARIANCENXM(16, 8, 7)
-VARIANCENXM(16, 16, 8)
-VARIANCENXM(16, 32, 9)
-VARIANCENXM(32, 16, 9)
-VARIANCENXM(32, 32, 10)
-
-unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- unsigned int *sse) {
- int sum1, sum2;
- uint32_t sse1, sse2;
- variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32, &sse1,
- &sum1);
- variance_neon_w16(src_ptr + (32 * src_stride), src_stride,
- ref_ptr + (32 * ref_stride), ref_stride, 32, 32, &sse2,
- &sum2);
- *sse = sse1 + sse2;
- sum1 += sum2;
- return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
-}
+VARIANCE_WXH_NEON(4, 4, 4)
+VARIANCE_WXH_NEON(4, 8, 5)
-unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- unsigned int *sse) {
- int sum1, sum2;
- uint32_t sse1, sse2;
- variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 64, 16, &sse1,
- &sum1);
- variance_neon_w16(src_ptr + (16 * src_stride), src_stride,
- ref_ptr + (16 * ref_stride), ref_stride, 64, 16, &sse2,
- &sum2);
- *sse = sse1 + sse2;
- sum1 += sum2;
- return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
-}
+VARIANCE_WXH_NEON(8, 4, 5)
+VARIANCE_WXH_NEON(8, 8, 6)
+VARIANCE_WXH_NEON(8, 16, 7)
-unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- unsigned int *sse) {
- int sum1, sum2;
- uint32_t sse1, sse2;
-
- variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 64, 16, &sse1,
- &sum1);
- variance_neon_w16(src_ptr + (16 * src_stride), src_stride,
- ref_ptr + (16 * ref_stride), ref_stride, 64, 16, &sse2,
- &sum2);
- sse1 += sse2;
- sum1 += sum2;
-
- variance_neon_w16(src_ptr + (16 * 2 * src_stride), src_stride,
- ref_ptr + (16 * 2 * ref_stride), ref_stride, 64, 16, &sse2,
- &sum2);
- sse1 += sse2;
- sum1 += sum2;
-
- variance_neon_w16(src_ptr + (16 * 3 * src_stride), src_stride,
- ref_ptr + (16 * 3 * ref_stride), ref_stride, 64, 16, &sse2,
- &sum2);
- *sse = sse1 + sse2;
- sum1 += sum2;
- return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12);
-}
+VARIANCE_WXH_NEON(16, 8, 7)
+VARIANCE_WXH_NEON(16, 16, 8)
+VARIANCE_WXH_NEON(16, 32, 9)
+
+VARIANCE_WXH_NEON(32, 16, 9)
+VARIANCE_WXH_NEON(32, 32, 10)
+VARIANCE_WXH_NEON(32, 64, 11)
+
+VARIANCE_WXH_NEON(64, 32, 11)
+VARIANCE_WXH_NEON(64, 64, 12)
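In VARIANCE_WXH_NEON the shift parameter is log2(w * h), so each function returns SSE - sum^2 / (w * h); widening sum * sum to int64_t before the shift avoids overflow for the larger blocks, where the product no longer fits in 32 bits. A small sketch of the formula, assuming power-of-two block dimensions:

#include <assert.h>
#include <stdint.h>

/* SSE - sum^2 / N with N = w * h, i.e. N times the block variance.  The
 * macro's 'shift' is log2(N): 8 for 16x16, 12 for 64x64, and so on. */
static uint32_t variance_sketch(uint32_t sse, int sum, int w, int h) {
  int shift = 0;
  while ((1 << shift) < w * h) ++shift;
  assert((1 << shift) == w * h); /* power-of-two block sizes only */
  return sse - (uint32_t)(((int64_t)sum * sum) >> shift);
}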
-#if defined(__ARM_FEATURE_DOTPROD) && (__ARM_FEATURE_DOTPROD == 1)
+#if defined(__ARM_FEATURE_DOTPROD)
unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride,
const unsigned char *ref_ptr, int ref_stride,
@@ -421,7 +433,7 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride,
return vget_lane_u32(sse, 0);
}
-#else
+#else // !defined(__ARM_FEATURE_DOTPROD)
unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride,
const unsigned char *ref_ptr, int ref_stride,
@@ -518,4 +530,4 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride,
return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse));
}
-#endif
+#endif // defined(__ARM_FEATURE_DOTPROD)
diff --git a/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
index 06b58c438..b4cdd58c7 100644
--- a/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
+++ b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -31,8 +31,9 @@
// instructions. This optimization is much faster in speed unit test, but slowed
// down the whole decoder by 5%.
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
- (__ARM_FEATURE_DOTPROD == 1)
+#if defined(__aarch64__) && \
+ (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8))
+
DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
@@ -53,9 +54,176 @@ DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
};
-static INLINE void transpose_concat_4x4(int8x8_t *a0, int8x8_t *a1,
- int8x8_t *a2, int8x8_t *a3,
- int8x16_t *b,
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+
+void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
+ uint8x16_t s0, s1, s2, s3;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(x_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ src -= 3;
+
+ if (w == 4) {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ int32x4_t t0, t1, t2, t3;
+ int16x8_t t01, t23;
+ uint8x8_t d01, d23;
+
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ t0 = convolve8_4_usdot(s0, filters, permute_tbl);
+ t1 = convolve8_4_usdot(s1, filters, permute_tbl);
+ t2 = convolve8_4_usdot(s2, filters, permute_tbl);
+ t3 = convolve8_4_usdot(s3, filters, permute_tbl);
+ t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
+ t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
+ d01 = vqrshrun_n_s16(t01, 7);
+ d23 = vqrshrun_n_s16(t23, 7);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const uint8_t *s;
+ uint8_t *d;
+ int width;
+ uint8x8_t d0, d1, d2, d3;
+
+ do {
+ width = w;
+ s = src;
+ d = dst;
+ do {
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ d0 = convolve8_8_usdot(s0, filters, permute_tbl);
+ d1 = convolve8_8_usdot(s1, filters, permute_tbl);
+ d2 = convolve8_8_usdot(s2, filters, permute_tbl);
+ d3 = convolve8_8_usdot(s3, filters, permute_tbl);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ }
+}
+
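The USDOT path can feed unsigned source bytes straight into the dot product against signed filter taps, so no range-limit bias is needed; per output pixel it still implements the standard 8-tap convolution, rounded by the shift of 7 seen in vqrshrun_n_s16 and clamped to [0, 255]. A scalar sketch of one horizontal output pixel (illustrative, assuming taps summing to 128):

#include <stdint.h>

/* One 8-tap output pixel: FIR sum, rounding shift by 7, clamp to [0, 255].
 * 'src' points at x - 3, matching the 'src -= 3' adjustment above. */
static uint8_t convolve8_pixel_sketch(const uint8_t *src,
                                      const int16_t filter[8]) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += filter[k] * src[k];
  sum = (sum + 64) >> 7;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}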
+void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
+ uint8x16_t s0, s1, s2, s3;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(x_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ src -= 3;
+
+ if (w == 4) {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ int32x4_t t0, t1, t2, t3;
+ int16x8_t t01, t23;
+ uint8x8_t d01, d23, dd01, dd23;
+ dd01 = vdup_n_u8(0);
+ dd23 = vdup_n_u8(0);
+
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ t0 = convolve8_4_usdot(s0, filters, permute_tbl);
+ t1 = convolve8_4_usdot(s1, filters, permute_tbl);
+ t2 = convolve8_4_usdot(s2, filters, permute_tbl);
+ t3 = convolve8_4_usdot(s3, filters, permute_tbl);
+ t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
+ t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
+ d01 = vqrshrun_n_s16(t01, 7);
+ d23 = vqrshrun_n_s16(t23, 7);
+
+ dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+ dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
+ d01 = vrhadd_u8(d01, dd01);
+ d23 = vrhadd_u8(d23, dd23);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const uint8_t *s;
+ uint8_t *d;
+ int width;
+ uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+
+ do {
+ width = w;
+ s = src;
+ d = dst;
+ do {
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ d0 = convolve8_8_usdot(s0, filters, permute_tbl);
+ d1 = convolve8_8_usdot(s1, filters, permute_tbl);
+ d2 = convolve8_8_usdot(s2, filters, permute_tbl);
+ d3 = convolve8_8_usdot(s3, filters, permute_tbl);
+
+ load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ d0 = vrhadd_u8(d0, dd0);
+ d1 = vrhadd_u8(d1, dd1);
+ d2 = vrhadd_u8(d2, dd2);
+ d3 = vrhadd_u8(d3, dd3);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ }
+}
+
+static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
+ uint8x8_t a2, uint8x8_t a3,
+ uint8x16_t *b,
const uint8x16_t permute_tbl) {
/* Transpose 8-bit elements and concatenate result rows as follows:
* a0: 00, 01, 02, 03, XX, XX, XX, XX
@@ -70,13 +238,13 @@ static INLINE void transpose_concat_4x4(int8x8_t *a0, int8x8_t *a1,
* inline helper is called many times from the same parent function.
*/
- int8x16x2_t samples = { { vcombine_s8(*a0, *a1), vcombine_s8(*a2, *a3) } };
- *b = vqtbl2q_s8(samples, permute_tbl);
+ uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
+ *b = vqtbl2q_u8(samples, permute_tbl);
}
-static INLINE void transpose_concat_8x4(int8x8_t *a0, int8x8_t *a1,
- int8x8_t *a2, int8x8_t *a3,
- int8x16_t *b0, int8x16_t *b1,
+static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
+ uint8x8_t a2, uint8x8_t a3,
+ uint8x16_t *b0, uint8x16_t *b1,
const uint8x16x2_t permute_tbl) {
/* Transpose 8-bit elements and concatenate result rows as follows:
* a0: 00, 01, 02, 03, 04, 05, 06, 07
@@ -92,11 +260,364 @@ static INLINE void transpose_concat_8x4(int8x8_t *a0, int8x8_t *a1,
* inline helper is called many times from the same parent function.
*/
- int8x16x2_t samples = { { vcombine_s8(*a0, *a1), vcombine_s8(*a2, *a3) } };
- *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]);
- *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]);
+ uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
+ *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]);
+ *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]);
+}
+
+void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
+ const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ uint8x16x2_t samples_LUT;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(y_step_q4 == 16);
+
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ src -= 3 * src_stride;
+
+ if (w == 4) {
+ const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+ uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+ int32x4_t d0, d1, d2, d3;
+ uint8x8_t d01, d23;
+
+ load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ src += 7 * src_stride;
+
+ s7 = vdup_n_u8(0);
+ s8 = vdup_n_u8(0);
+ s9 = vdup_n_u8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+ transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+ transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
+
+ do {
+ load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
+
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456;
+ samples_LUT.val[1] = s78910;
+ s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_4_usdot_partial(s0123, s4567, filters);
+ d1 = convolve8_4_usdot_partial(s1234, s5678, filters);
+ d2 = convolve8_4_usdot_partial(s2345, s6789, filters);
+ d3 = convolve8_4_usdot_partial(s3456, s78910, filters);
+ d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
+ d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123 = s4567;
+ s1234 = s5678;
+ s2345 = s6789;
+ s3456 = s78910;
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+ uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+ s6789_hi, s78910_lo, s78910_hi;
+ uint8x8_t d0, d1, d2, d3;
+ const uint8_t *s;
+ uint8_t *d;
+ int height;
+
+ do {
+ height = h;
+ s = src;
+ d = dst;
+
+ load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ s7 = vdup_n_u8(0);
+ s8 = vdup_n_u8(0);
+ s9 = vdup_n_u8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
+ tran_concat_tbl);
+
+ do {
+ load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+ tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456_lo;
+ samples_LUT.val[1] = s78910_lo;
+ s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ samples_LUT.val[0] = s3456_hi;
+ samples_LUT.val[1] = s78910_hi;
+ s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+ filters);
+ d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+ filters);
+ d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+ filters);
+ d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+ filters);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123_lo = s4567_lo;
+ s0123_hi = s4567_hi;
+ s1234_lo = s5678_lo;
+ s1234_hi = s5678_hi;
+ s2345_lo = s6789_lo;
+ s2345_hi = s6789_hi;
+ s3456_lo = s78910_lo;
+ s3456_hi = s78910_hi;
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height > 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w > 0);
+ }
}
+void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
+ const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ uint8x16x2_t samples_LUT;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(y_step_q4 == 16);
+
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ src -= 3 * src_stride;
+
+ if (w == 4) {
+ const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+ uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+ int32x4_t d0, d1, d2, d3;
+ uint8x8_t d01, d23, dd01, dd23;
+
+ load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ src += 7 * src_stride;
+
+ s7 = vdup_n_u8(0);
+ s8 = vdup_n_u8(0);
+ s9 = vdup_n_u8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+ transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+ transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
+
+ do {
+ load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
+
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456;
+ samples_LUT.val[1] = s78910;
+ s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_4_usdot_partial(s0123, s4567, filters);
+ d1 = convolve8_4_usdot_partial(s1234, s5678, filters);
+ d2 = convolve8_4_usdot_partial(s2345, s6789, filters);
+ d3 = convolve8_4_usdot_partial(s3456, s78910, filters);
+ d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
+ d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
+
+ dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+ dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
+ d01 = vrhadd_u8(d01, dd01);
+ d23 = vrhadd_u8(d23, dd23);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123 = s4567;
+ s1234 = s5678;
+ s2345 = s6789;
+ s3456 = s78910;
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+ uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+ s6789_hi, s78910_lo, s78910_hi;
+ uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+ const uint8_t *s;
+ uint8_t *d;
+ int height;
+
+ do {
+ height = h;
+ s = src;
+ d = dst;
+
+ load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ s7 = vdup_n_u8(0);
+ s8 = vdup_n_u8(0);
+ s9 = vdup_n_u8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
+ tran_concat_tbl);
+
+ do {
+ load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+ tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456_lo;
+ samples_LUT.val[1] = s78910_lo;
+ s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ samples_LUT.val[0] = s3456_hi;
+ samples_LUT.val[1] = s78910_hi;
+ s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+ filters);
+ d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+ filters);
+ d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+ filters);
+ d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+ filters);
+
+ load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ d0 = vrhadd_u8(d0, dd0);
+ d1 = vrhadd_u8(d1, dd1);
+ d2 = vrhadd_u8(d2, dd2);
+ d3 = vrhadd_u8(d3, dd3);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123_lo = s4567_lo;
+ s0123_hi = s4567_hi;
+ s1234_lo = s5678_lo;
+ s1234_hi = s5678_hi;
+ s2345_lo = s6789_lo;
+ s2345_hi = s6789_hi;
+ s3456_lo = s78910_lo;
+ s3456_hi = s78910_hi;
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height > 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+}
+
+#else // !defined(__ARM_FEATURE_MATMUL_INT8)
+
void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const InterpKernel *filter, int x0_q4,
@@ -125,33 +646,22 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
int16x8_t t01, t23;
uint8x8_t d01, d23;
- s0 = vld1q_u8(src);
- src += src_stride;
- s1 = vld1q_u8(src);
- src += src_stride;
- s2 = vld1q_u8(src);
- src += src_stride;
- s3 = vld1q_u8(src);
- src += src_stride;
-
- t0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl);
- t1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl);
- t2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl);
- t3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl);
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+ t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl);
+ t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl);
+ t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl);
+ t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl);
t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
d01 = vqrshrun_n_s16(t01, 7);
d23 = vqrshrun_n_s16(t23, 7);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
- dst += dst_stride;
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else {
@@ -166,20 +676,18 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
s = src;
d = dst;
do {
- s0 = vld1q_u8(s + 0 * src_stride);
- s1 = vld1q_u8(s + 1 * src_stride);
- s2 = vld1q_u8(s + 2 * src_stride);
- s3 = vld1q_u8(s + 3 * src_stride);
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
- d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl);
- d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl);
- d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl);
- d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl);
+ d0 =
+ convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl);
+ d1 =
+ convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl);
+ d2 =
+ convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl);
+ d3 =
+ convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl);
- vst1_u8(d + 0 * dst_stride, d0);
- vst1_u8(d + 1 * dst_stride, d1);
- vst1_u8(d + 2 * dst_stride, d2);
- vst1_u8(d + 3 * dst_stride, d3);
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
s += 8;
d += 8;
@@ -222,20 +730,12 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
dd01 = vdup_n_u8(0);
dd23 = vdup_n_u8(0);
- s0 = vld1q_u8(src);
- src += src_stride;
- s1 = vld1q_u8(src);
- src += src_stride;
- s2 = vld1q_u8(src);
- src += src_stride;
- s3 = vld1q_u8(src);
- src += src_stride;
-
- t0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl);
- t1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl);
- t2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl);
- t3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl);
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+ t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl);
+ t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl);
+ t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl);
+ t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl);
t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
d01 = vqrshrun_n_s16(t01, 7);
@@ -243,17 +743,15 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
d01 = vrhadd_u8(d01, dd01);
d23 = vrhadd_u8(d23, dd23);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
- dst += dst_stride;
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else {
@@ -268,29 +766,25 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
s = src;
d = dst;
do {
- s0 = vld1q_u8(s + 0 * src_stride);
- s1 = vld1q_u8(s + 1 * src_stride);
- s2 = vld1q_u8(s + 2 * src_stride);
- s3 = vld1q_u8(s + 3 * src_stride);
-
- d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl);
- d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl);
- d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl);
- d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl);
-
- dd0 = vld1_u8(d + 0 * dst_stride);
- dd1 = vld1_u8(d + 1 * dst_stride);
- dd2 = vld1_u8(d + 2 * dst_stride);
- dd3 = vld1_u8(d + 3 * dst_stride);
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ d0 =
+ convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl);
+ d1 =
+ convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl);
+ d2 =
+ convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl);
+ d3 =
+ convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl);
+
+ load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
d0 = vrhadd_u8(d0, dd0);
d1 = vrhadd_u8(d1, dd1);
d2 = vrhadd_u8(d2, dd2);
d3 = vrhadd_u8(d3, dd3);
- vst1_u8(d + 0 * dst_stride, d0);
- vst1_u8(d + 1 * dst_stride, d1);
- vst1_u8(d + 2 * dst_stride, d2);
- vst1_u8(d + 3 * dst_stride, d3);
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
s += 8;
d += 8;
@@ -303,6 +797,49 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
}
}
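Unlike the USDOT variants earlier in the file, these SDOT kernels must first bias the unsigned samples into signed range (the range_limit subtraction below) and then add back a precomputed correction, because SDOT only accepts signed 8-bit inputs. The identity behind it, sketched in scalar form and assuming taps that sum to 128:

#include <stdint.h>

/* sum(f[k] * x[k]) == sum(f[k] * (x[k] - 128)) + 128 * sum(f[k]).
 * With taps summing to 128 the correction term is the constant 128 * 128,
 * which the kernels fold into the accumulator up front. */
static int32_t sdot_bias_sketch(const int8_t f[8], const uint8_t x[8]) {
  int k;
  int32_t acc = 128 * 128; /* correction */
  for (k = 0; k < 8; ++k) acc += f[k] * (int32_t)(int8_t)(x[k] - 128);
  return acc; /* equals the unbiased sum(f[k] * x[k]) */
}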
+static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+ int8x8_t a3, int8x16_t *b,
+ const uint8x16_t permute_tbl) {
+ /* Transpose 8-bit elements and concatenate result rows as follows:
+ * a0: 00, 01, 02, 03, XX, XX, XX, XX
+ * a1: 10, 11, 12, 13, XX, XX, XX, XX
+ * a2: 20, 21, 22, 23, XX, XX, XX, XX
+ * a3: 30, 31, 32, 33, XX, XX, XX, XX
+ *
+ * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ *
+ * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+ * as an argument is preferable to loading it directly from memory as this
+ * inline helper is called many times from the same parent function.
+ */
+
+ int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
+ *b = vqtbl2q_s8(samples, permute_tbl);
+}
+
+static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+ int8x8_t a3, int8x16_t *b0,
+ int8x16_t *b1,
+ const uint8x16x2_t permute_tbl) {
+ /* Transpose 8-bit elements and concatenate result rows as follows:
+ * a0: 00, 01, 02, 03, 04, 05, 06, 07
+ * a1: 10, 11, 12, 13, 14, 15, 16, 17
+ * a2: 20, 21, 22, 23, 24, 25, 26, 27
+ * a3: 30, 31, 32, 33, 34, 35, 36, 37
+ *
+ * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+ *
+ * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+ * as an argument is preferable to loading it directly from memory as this
+ * inline helper is called many times from the same parent function.
+ */
+
+ int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
+ *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]);
+ *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]);
+}
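
A scalar model of the layout these helpers build may help when reviewing the permute tables; the sketch below is illustrative only (the function name and plain-array types are placeholders, not part of the patch):

#include <stdint.h>

/* Interleave four 4-sample rows column by column, matching the
 * 00, 10, 20, 30, 01, 11, 21, 31, ... layout that transpose_concat_4x4
 * produces with vqtbl2q_s8 and 'dot_prod_tran_concat_tbl'.
 * transpose_concat_8x4 does the same for 8-sample rows, with the result
 * split across two output vectors. */
static void transpose_concat_4x4_scalar(const int8_t a0[4], const int8_t a1[4],
                                        const int8_t a2[4], const int8_t a3[4],
                                        int8_t b[16]) {
  int col;
  for (col = 0; col < 4; col++) {
    b[4 * col + 0] = a0[col];
    b[4 * col + 1] = a1[col];
    b[4 * col + 2] = a2[col];
    b[4 * col + 3] = a3[col];
  }
}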
+
void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const InterpKernel *filter, int x0_q4,
@@ -333,14 +870,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
int32x4_t d0, d1, d2, d3;
uint8x8_t d01, d23;
- load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- src += 4 * src_stride;
- t4 = vld1_u8(src);
- src += src_stride;
- t5 = vld1_u8(src);
- src += src_stride;
- t6 = vld1_u8(src);
- src += src_stride;
+ load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ src += 7 * src_stride;
/* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
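
The range clamp above pairs with the 'correction' constant set up by the callers of these helpers: the SDOT instructions multiply signed 8-bit operands, so the unsigned pixels are biased into [-128, 127] and the bias is folded back in afterwards. A scalar sketch of the idea follows, with 'correction' taken to be the precomputed value 128 * (sum of the filter taps); that value is an assumption based on how the bias algebra must balance, not something spelled out in this hunk.

#include <stdint.h>

/* Illustrative scalar equivalent of one signed-dot-product output sample. */
static int32_t convolve8_sdot_scalar(const uint8_t *x, const int8_t *f) {
  int32_t correction = 0;
  int32_t sum;
  int i;
  /* Precomputed once per filter in the vectorized code. */
  for (i = 0; i < 8; i++) correction += 128 * f[i];
  sum = correction;
  for (i = 0; i < 8; i++) {
    /* (x[i] - 128) fits in int8_t, so a signed * signed multiply works. */
    sum += (int32_t)(int8_t)(x[i] - 128) * f[i];
  }
  return sum; /* Equals the plain sum of x[i] * f[i]. */
}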
@@ -357,13 +888,13 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
/* This operation combines a conventional transpose and the sample permute
* (see horizontal case) required before computing the dot product.
*/
- transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl);
- transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl);
- transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl);
- transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl);
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+ transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+ transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
do {
uint8x8_t t7, t8, t9, t10;
@@ -375,7 +906,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
- transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl);
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
/* Merge new data into block from previous iteration. */
samples_LUT.val[0] = s3456;
@@ -384,22 +915,15 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
- d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters);
- d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters);
- d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters);
- d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters);
-
+ d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters);
+ d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters);
+ d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters);
+ d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters);
d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
- dst += dst_stride;
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
/* Prepare block for next iteration - re-using as much as possible. */
/* Shuffle everything up four rows. */
@@ -409,6 +933,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s3456 = s78910;
src += 4 * src_stride;
+ dst += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else {
@@ -426,14 +951,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s = src;
d = dst;
- load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
- s += 4 * src_stride;
- t4 = vld1_u8(s);
- s += src_stride;
- t5 = vld1_u8(s);
- s += src_stride;
- t6 = vld1_u8(s);
- s += src_stride;
+ load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ s += 7 * src_stride;
/* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
@@ -450,19 +969,19 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
/* This operation combines a conventional transpose and the sample permute
* (see horizontal case) required before computing the dot product.
*/
- transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi,
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi,
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi,
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi,
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi,
+ transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi,
+ transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi,
+ transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
tran_concat_tbl);
do {
@@ -475,7 +994,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
- transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi,
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
tran_concat_tbl);
/* Merge new data into block from previous iteration. */
@@ -491,18 +1010,16 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
- d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
- correction, filters);
- d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
- correction, filters);
- d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
- correction, filters);
- d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
- correction, filters);
- vst1_u8(d + 0 * dst_stride, d0);
- vst1_u8(d + 1 * dst_stride, d1);
- vst1_u8(d + 2 * dst_stride, d2);
- vst1_u8(d + 3 * dst_stride, d3);
+ d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+ correction, filters);
+ d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+ correction, filters);
+ d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+ correction, filters);
+ d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+ correction, filters);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
/* Prepare block for next iteration - re-using as much as possible. */
/* Shuffle everything up four rows. */
@@ -556,14 +1073,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
int32x4_t d0, d1, d2, d3;
uint8x8_t d01, d23, dd01, dd23;
- load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- src += 4 * src_stride;
- t4 = vld1_u8(src);
- src += src_stride;
- t5 = vld1_u8(src);
- src += src_stride;
- t6 = vld1_u8(src);
- src += src_stride;
+ load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ src += 7 * src_stride;
/* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
@@ -580,13 +1091,13 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
/* This operation combines a conventional transpose and the sample permute
* (see horizontal case) required before computing the dot product.
*/
- transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl);
- transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl);
- transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl);
- transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl);
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+ transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+ transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
do {
uint8x8_t t7, t8, t9, t10;
@@ -598,7 +1109,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
- transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl);
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
/* Merge new data into block from previous iteration. */
samples_LUT.val[0] = s3456;
@@ -607,27 +1118,21 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
- d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters);
- d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters);
- d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters);
- d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters);
-
+ d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters);
+ d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters);
+ d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters);
+ d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters);
d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
d01 = vrhadd_u8(d01, dd01);
d23 = vrhadd_u8(d23, dd23);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
- dst += dst_stride;
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
/* Prepare block for next iteration - re-using as much as possible. */
/* Shuffle everything up four rows. */
@@ -637,6 +1142,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s3456 = s78910;
src += 4 * src_stride;
+ dst += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else {
@@ -654,14 +1160,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s = src;
d = dst;
- load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
- s += 4 * src_stride;
- t4 = vld1_u8(s);
- s += src_stride;
- t5 = vld1_u8(s);
- s += src_stride;
- t6 = vld1_u8(s);
- s += src_stride;
+ load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ s += 7 * src_stride;
/* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
@@ -678,19 +1178,19 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
/* This operation combines a conventional transpose and the sample permute
* (see horizontal case) required before computing the dot product.
*/
- transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi,
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi,
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi,
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi,
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi,
+ transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi,
+ transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi,
+ transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
tran_concat_tbl);
do {
@@ -703,7 +1203,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
- transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi,
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
tran_concat_tbl);
/* Merge new data into block from previous iteration. */
@@ -719,28 +1219,23 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
- d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
- correction, filters);
- d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
- correction, filters);
- d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
- correction, filters);
- d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
- correction, filters);
-
- dd0 = vld1_u8(d + 0 * dst_stride);
- dd1 = vld1_u8(d + 1 * dst_stride);
- dd2 = vld1_u8(d + 2 * dst_stride);
- dd3 = vld1_u8(d + 3 * dst_stride);
+ d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+ correction, filters);
+ d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+ correction, filters);
+ d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+ correction, filters);
+ d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+ correction, filters);
+
+ load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
d0 = vrhadd_u8(d0, dd0);
d1 = vrhadd_u8(d1, dd1);
d2 = vrhadd_u8(d2, dd2);
d3 = vrhadd_u8(d3, dd3);
- vst1_u8(d + 0 * dst_stride, d0);
- vst1_u8(d + 1 * dst_stride, d1);
- vst1_u8(d + 2 * dst_stride, d2);
- vst1_u8(d + 3 * dst_stride, d3);
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
/* Prepare block for next iteration - re-using as much as possible. */
/* Shuffle everything up four rows. */
@@ -764,29 +1259,11 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
}
}
-#else
-
-static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p,
- const uint8x8_t s0, const uint8x8_t s1,
- const uint8x8_t s2, const uint8x8_t s3,
- const uint8x8_t s4, const uint8x8_t s5,
- const uint8x8_t s6, const uint8x8_t s7) {
- vst1_u8(s, s0);
- s += p;
- vst1_u8(s, s1);
- s += p;
- vst1_u8(s, s2);
- s += p;
- vst1_u8(s, s3);
- s += p;
- vst1_u8(s, s4);
- s += p;
- vst1_u8(s, s5);
- s += p;
- vst1_u8(s, s6);
- s += p;
- vst1_u8(s, s7);
-}
+#endif // defined(__ARM_FEATURE_MATMUL_INT8)
+
+#else // !(defined(__aarch64__) &&
+ // (defined(__ARM_FEATURE_DOTPROD) ||
+ // defined(__ARM_FEATURE_MATMUL_INT8)))
void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
@@ -808,16 +1285,13 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
if (h == 4) {
uint8x8_t d01, d23;
- int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0,
- d1, d2, d3;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
int16x8_t tt0, tt1, tt2, tt3;
__builtin_prefetch(src + 0 * src_stride);
__builtin_prefetch(src + 1 * src_stride);
__builtin_prefetch(src + 2 * src_stride);
__builtin_prefetch(src + 3 * src_stride);
- filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
- filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
@@ -849,14 +1323,10 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
s9 = vget_low_s16(tt2);
s10 = vget_low_s16(tt3);
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
@@ -883,8 +1353,6 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
w -= 4;
} while (w != 0);
} else {
- const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
- const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
int width;
const uint8_t *s;
uint8x8_t t4, t5, t6, t7;
@@ -927,14 +1395,10 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(src + 5 * src_stride);
__builtin_prefetch(src + 6 * src_stride);
__builtin_prefetch(src + 7 * src_stride);
- t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0);
@@ -1002,22 +1466,14 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
- t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
- t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3,
- filter4);
- t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3,
- filter4);
- t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3,
- filter4);
- t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters,
- filter3, filter4);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
+ t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
+ t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
+ t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);
@@ -1061,8 +1517,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
if (h == 4) {
uint8x8_t d01, d23;
- int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0,
- d1, d2, d3;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
int16x8_t tt0, tt1, tt2, tt3;
uint32x4_t d0123 = vdupq_n_u32(0);
@@ -1070,8 +1525,6 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(src + 1 * src_stride);
__builtin_prefetch(src + 2 * src_stride);
__builtin_prefetch(src + 3 * src_stride);
- filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
- filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
@@ -1103,14 +1556,10 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
s9 = vget_low_s16(tt2);
s10 = vget_low_s16(tt3);
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
@@ -1140,8 +1589,6 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
w -= 4;
} while (w != 0);
} else {
- const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
- const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
int width;
const uint8_t *s;
uint8x8_t t4, t5, t6, t7;
@@ -1186,14 +1633,10 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(src + 5 * src_stride);
__builtin_prefetch(src + 6 * src_stride);
__builtin_prefetch(src + 7 * src_stride);
- t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
@@ -1276,22 +1719,14 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
- t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
- t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3,
- filter4);
- t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3,
- filter4);
- t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3,
- filter4);
- t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters,
- filter3, filter4);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
+ t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
+ t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
+ t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
@@ -1349,8 +1784,6 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
src -= 3 * src_stride;
if (w == 4) {
- const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
- const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
uint8x8_t d01, d23;
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
@@ -1387,14 +1820,10 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(src + 1 * src_stride);
__builtin_prefetch(src + 2 * src_stride);
__builtin_prefetch(src + 3 * src_stride);
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
@@ -1417,8 +1846,6 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
h -= 4;
} while (h != 0);
} else {
- const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
- const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
int height;
const uint8_t *s;
uint8_t *d;
@@ -1469,14 +1896,10 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(s + 1 * src_stride);
__builtin_prefetch(s + 2 * src_stride);
__builtin_prefetch(s + 3 * src_stride);
- t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
vst1_u8(d, t0);
d += dst_stride;
@@ -1521,8 +1944,6 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
src -= 3 * src_stride;
if (w == 4) {
- const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
- const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
uint8x8_t d01, d23;
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
uint32x4_t d0123 = vdupq_n_u32(0);
@@ -1560,14 +1981,10 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(src + 1 * src_stride);
__builtin_prefetch(src + 2 * src_stride);
__builtin_prefetch(src + 3 * src_stride);
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
@@ -1598,8 +2015,6 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
h -= 4;
} while (h != 0);
} else {
- const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
- const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
int height;
const uint8_t *s;
uint8_t *d;
@@ -1651,14 +2066,10 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(s + 1 * src_stride);
__builtin_prefetch(s + 2 * src_stride);
__builtin_prefetch(s + 3 * src_stride);
- t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
d01 = vcombine_u8(t0, t1);
d23 = vcombine_u8(t2, t3);
@@ -1694,4 +2105,6 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
}
}
-#endif
+#endif // #if defined(__aarch64__) &&
+ // (defined(__ARM_FEATURE_DOTPROD) ||
+ // defined(__ARM_FEATURE_MATMUL_INT8))
diff --git a/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h
index 857b6d54e..ed7f18053 100644
--- a/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h
+++ b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h
@@ -16,69 +16,12 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
-static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
- uint8x8_t *const s0, uint8x8_t *const s1,
- uint8x8_t *const s2, uint8x8_t *const s3) {
- *s0 = vld1_u8(s);
- s += p;
- *s1 = vld1_u8(s);
- s += p;
- *s2 = vld1_u8(s);
- s += p;
- *s3 = vld1_u8(s);
-}
-
-static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p,
- uint8x8_t *const s0, uint8x8_t *const s1,
- uint8x8_t *const s2, uint8x8_t *const s3,
- uint8x8_t *const s4, uint8x8_t *const s5,
- uint8x8_t *const s6, uint8x8_t *const s7) {
- *s0 = vld1_u8(s);
- s += p;
- *s1 = vld1_u8(s);
- s += p;
- *s2 = vld1_u8(s);
- s += p;
- *s3 = vld1_u8(s);
- s += p;
- *s4 = vld1_u8(s);
- s += p;
- *s5 = vld1_u8(s);
- s += p;
- *s6 = vld1_u8(s);
- s += p;
- *s7 = vld1_u8(s);
-}
+#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
-static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p,
- uint8x16_t *const s0, uint8x16_t *const s1,
- uint8x16_t *const s2, uint8x16_t *const s3,
- uint8x16_t *const s4, uint8x16_t *const s5,
- uint8x16_t *const s6, uint8x16_t *const s7) {
- *s0 = vld1q_u8(s);
- s += p;
- *s1 = vld1q_u8(s);
- s += p;
- *s2 = vld1q_u8(s);
- s += p;
- *s3 = vld1q_u8(s);
- s += p;
- *s4 = vld1q_u8(s);
- s += p;
- *s5 = vld1q_u8(s);
- s += p;
- *s6 = vld1q_u8(s);
- s += p;
- *s7 = vld1q_u8(s);
-}
-
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
- (__ARM_FEATURE_DOTPROD == 1)
-
-static INLINE int32x4_t convolve8_4_dot_partial(const int8x16_t samples_lo,
- const int8x16_t samples_hi,
- const int32x4_t correction,
- const int8x8_t filters) {
+static INLINE int32x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
+ const int8x16_t samples_hi,
+ const int32x4_t correction,
+ const int8x8_t filters) {
/* Sample range-clamping and permutation are performed by the caller. */
int32x4_t sum;
@@ -90,11 +33,11 @@ static INLINE int32x4_t convolve8_4_dot_partial(const int8x16_t samples_lo,
return sum;
}
-static INLINE int32x4_t convolve8_4_dot(uint8x16_t samples,
- const int8x8_t filters,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x2_t permute_tbl) {
+static INLINE int32x4_t convolve8_4_sdot(uint8x16_t samples,
+ const int8x8_t filters,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x2_t permute_tbl) {
int8x16_t clamped_samples, permuted_samples[2];
int32x4_t sum;
@@ -115,12 +58,12 @@ static INLINE int32x4_t convolve8_4_dot(uint8x16_t samples,
return sum;
}
-static INLINE uint8x8_t convolve8_8_dot_partial(const int8x16_t samples0_lo,
- const int8x16_t samples0_hi,
- const int8x16_t samples1_lo,
- const int8x16_t samples1_hi,
- const int32x4_t correction,
- const int8x8_t filters) {
+static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo,
+ const int8x16_t samples0_hi,
+ const int8x16_t samples1_lo,
+ const int8x16_t samples1_hi,
+ const int32x4_t correction,
+ const int8x8_t filters) {
/* Sample range-clamping and permutation are performed by the caller. */
int32x4_t sum0, sum1;
int16x8_t sum;
@@ -138,11 +81,11 @@ static INLINE uint8x8_t convolve8_8_dot_partial(const int8x16_t samples0_lo,
return vqrshrun_n_s16(sum, 7);
}
-static INLINE uint8x8_t convolve8_8_dot(uint8x16_t samples,
- const int8x8_t filters,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x3_t permute_tbl) {
+static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples,
+ const int8x8_t filters,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x3_t permute_tbl) {
int8x16_t clamped_samples, permuted_samples[3];
int32x4_t sum0, sum1;
int16x8_t sum;
@@ -171,15 +114,98 @@ static INLINE uint8x8_t convolve8_8_dot(uint8x16_t samples,
return vqrshrun_n_s16(sum, 7);
}
-#endif
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
+
+static INLINE int32x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
+ const uint8x16_t samples_hi,
+ const int8x8_t filters) {
+ /* Sample permutation is performed by the caller. */
+ int32x4_t sum;
+
+ sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0);
+ sum = vusdotq_lane_s32(sum, samples_hi, filters, 1);
+
+ /* Narrowing and packing is performed by the caller. */
+ return sum;
+}
+
+static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x2_t permute_tbl) {
+ uint8x16_t permuted_samples[2];
+ int32x4_t sum;
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+
+  /* Accumulate dot product; no range-clamp correction is needed here. */
+ sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
+ sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1);
+
+ /* Narrowing and packing is performed by the caller. */
+ return sum;
+}
+
+static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo,
+ const uint8x16_t samples0_hi,
+ const uint8x16_t samples1_lo,
+ const uint8x16_t samples1_hi,
+ const int8x8_t filters) {
+ /* Sample permutation is performed by the caller. */
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* First 4 output values. */
+ sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0);
+ sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1);
+ /* Second 4 output values. */
+ sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0);
+ sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+ return vqrshrun_n_s16(sum, 7);
+}
+
+static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x3_t permute_tbl) {
+ uint8x16_t permuted_samples[3];
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+ /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+ permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+ /* First 4 output values. */
+ sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
+ sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
+ /* Second 4 output values. */
+ sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
+ sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+ return vqrshrun_n_s16(sum, 7);
+}
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
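
The helpers above split the 8-tap filter across the two 4-element lanes of 'filters', and each permuted 16-byte vector holds four overlapping 4-sample windows, so one USDOT per lane produces four output samples. A scalar sketch of that decomposition (illustrative only; with the unsigned-by-signed form no range-clamp correction is required):

#include <stdint.h>

/* o[k] = dot(x[k..k+3], f[0..3]) + dot(x[k+4..k+7], f[4..7]), k = 0..3. */
static void convolve8_usdot_scalar(const uint8_t *x, const int8_t *f,
                                   int32_t o[4]) {
  int k, i;
  for (k = 0; k < 4; k++) {
    int32_t sum = 0;
    for (i = 0; i < 4; i++) sum += (int32_t)x[k + i] * f[i];         /* lane 0 */
    for (i = 0; i < 4; i++) sum += (int32_t)x[k + 4 + i] * f[4 + i]; /* lane 1 */
    o[k] = sum;
  }
}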
static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,
const int16x4_t s4, const int16x4_t s5,
const int16x4_t s6, const int16x4_t s7,
- const int16x8_t filters,
- const int16x4_t filter3,
- const int16x4_t filter4) {
+ const int16x8_t filters) {
const int16x4_t filters_lo = vget_low_s16(filters);
const int16x4_t filters_hi = vget_high_s16(filters);
int16x4_t sum;
@@ -190,8 +216,8 @@ static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
sum = vmla_lane_s16(sum, s5, filters_hi, 1);
sum = vmla_lane_s16(sum, s6, filters_hi, 2);
sum = vmla_lane_s16(sum, s7, filters_hi, 3);
- sum = vqadd_s16(sum, vmul_s16(s3, filter3));
- sum = vqadd_s16(sum, vmul_s16(s4, filter4));
+ sum = vqadd_s16(sum, vmul_lane_s16(s3, filters_lo, 3));
+ sum = vqadd_s16(sum, vmul_lane_s16(s4, filters_hi, 0));
return sum;
}
@@ -199,9 +225,7 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
const int16x8_t s2, const int16x8_t s3,
const int16x8_t s4, const int16x8_t s5,
const int16x8_t s6, const int16x8_t s7,
- const int16x8_t filters,
- const int16x8_t filter3,
- const int16x8_t filter4) {
+ const int16x8_t filters) {
const int16x4_t filters_lo = vget_low_s16(filters);
const int16x4_t filters_hi = vget_high_s16(filters);
int16x8_t sum;
@@ -212,15 +236,13 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
sum = vmlaq_lane_s16(sum, s5, filters_hi, 1);
sum = vmlaq_lane_s16(sum, s6, filters_hi, 2);
sum = vmlaq_lane_s16(sum, s7, filters_hi, 3);
- sum = vqaddq_s16(sum, vmulq_s16(s3, filter3));
- sum = vqaddq_s16(sum, vmulq_s16(s4, filter4));
+ sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filters_lo, 3));
+ sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filters_hi, 0));
return vqrshrun_n_s16(sum, 7);
}
static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
const int16x8_t filters) {
- const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
- const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
int16x8_t ss[8];
ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
@@ -233,7 +255,7 @@ static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7]));
return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7],
- filters, filter3, filter4);
+ filters);
}
#endif // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_
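
A note on the convolve8_4 / convolve8_8 simplification above: folding filter3/filter4 into vmul_lane_s16 / vmulq_lane_s16 drops the pre-duplicated vectors but keeps the saturating vqadd for taps 3 and 4, the two largest centre coefficients. The saturation appears to guard the int16 intermediate against overflow; for illustration, with 8-bit samples up to 255 and a hypothetical sharp filter whose taps' absolute values sum to around 200,

  255 * 200 = 51000 > INT16_MAX = 32767

while the remaining six taps are small enough on their own to accumulate with plain vmla, so only the two largest products need the saturating add.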
diff --git a/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c b/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
index 8edf8a66e..b8e3c5e54 100644
--- a/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
+++ b/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
@@ -15,6 +15,7 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/arm/vpx_convolve8_neon.h"
#include "vpx_ports/mem.h"
@@ -38,8 +39,6 @@ static INLINE void scaledconvolve_horiz_w4(
const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
if (x_q4 & SUBPEL_MASK) {
const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
- const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
- const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
uint8x8_t s[8], d;
int16x8_t ss[4];
int16x4_t t[8], tt;
@@ -61,7 +60,7 @@ static INLINE void scaledconvolve_horiz_w4(
t[7] = vget_high_s16(ss[3]);
tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
- filters, filter3, filter4);
+ filters);
d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0);
} else {
@@ -167,8 +166,6 @@ static INLINE void scaledconvolve_vert_w4(
if (y_q4 & SUBPEL_MASK) {
const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
- const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
- const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
uint8x8_t s[8], d;
int16x4_t t[8], tt;
@@ -183,8 +180,7 @@ static INLINE void scaledconvolve_vert_w4(
t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6])));
t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7])));
- tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters,
- filter3, filter4);
+ tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters);
d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
} else {
diff --git a/libvpx/vpx_dsp/avg.c b/libvpx/vpx_dsp/avg.c
index 1c45e8a73..954015407 100644
--- a/libvpx/vpx_dsp/avg.c
+++ b/libvpx/vpx_dsp/avg.c
@@ -7,6 +7,8 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
+
+#include <assert.h>
#include <stdlib.h>
#include "./vpx_dsp_rtcd.h"
@@ -344,6 +346,7 @@ void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
const int ref_stride, const int height) {
int idx;
const int norm_factor = height >> 1;
+ assert(height >= 2);
for (idx = 0; idx < 16; ++idx) {
int i;
hbuf[idx] = 0;
diff --git a/libvpx/vpx_dsp/bitwriter.h b/libvpx/vpx_dsp/bitwriter.h
index 04084af8f..5f1ee69ec 100644
--- a/libvpx/vpx_dsp/bitwriter.h
+++ b/libvpx/vpx_dsp/bitwriter.h
@@ -13,6 +13,7 @@
#include <stdio.h>
+#include "vpx_ports/compiler_attributes.h"
#include "vpx_ports/mem.h"
#include "vpx_dsp/prob.h"
@@ -35,7 +36,9 @@ typedef struct vpx_writer {
void vpx_start_encode(vpx_writer *br, uint8_t *source);
void vpx_stop_encode(vpx_writer *br);
-static INLINE void vpx_write(vpx_writer *br, int bit, int probability) {
+static INLINE VPX_NO_UNSIGNED_SHIFT_CHECK void vpx_write(vpx_writer *br,
+ int bit,
+ int probability) {
unsigned int split;
int count = br->count;
unsigned int range = br->range;
diff --git a/libvpx/vpx_dsp/loongarch/quantize_lsx.c b/libvpx/vpx_dsp/loongarch/quantize_lsx.c
index 2fc33b06b..77be0bb4f 100644
--- a/libvpx/vpx_dsp/loongarch/quantize_lsx.c
+++ b/libvpx/vpx_dsp/loongarch/quantize_lsx.c
@@ -59,7 +59,6 @@ static INLINE void calculate_dqcoeff_and_store_32x32(__m128i qcoeff,
}
static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1,
- __m128i zbin_mask0, __m128i zbin_mask1,
const int16_t *scan, int index,
__m128i zero) {
const __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero);
@@ -68,8 +67,6 @@ static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1,
__m128i scan1 = __lsx_vld(scan + index + 8, 0);
__m128i eob0, eob1;
- scan0 = __lsx_vsub_h(scan0, zbin_mask0);
- scan1 = __lsx_vsub_h(scan1, zbin_mask1);
eob0 = __lsx_vandn_v(zero_coeff0, scan0);
eob1 = __lsx_vandn_v(zero_coeff1, scan1);
return __lsx_vmax_h(eob0, eob1);
@@ -138,7 +135,7 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
dequant = __lsx_vilvh_d(dequant, dequant);
calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
- eob = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero);
// AC only loop.
while (index < n_coeffs) {
coeff0 = __lsx_vld(coeff_ptr + index, 0);
@@ -161,8 +158,7 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
- eob0 = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
- zero);
+ eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero);
eob = __lsx_vmax_h(eob, eob0);
index += 16;
@@ -221,7 +217,7 @@ void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr);
dequant = __lsx_vilvh_d(dequant, dequant);
calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8);
- eob = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero);
// AC only loop.
for (index = 16; index < 32 * 32; index += 16) {
coeff0 = __lsx_vld(coeff_ptr + index, 0);
@@ -243,8 +239,7 @@ void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr + index);
calculate_dqcoeff_and_store_32x32(qcoeff1, dequant,
dqcoeff_ptr + 8 + index);
- eob0 = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
- zero);
+ eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero);
eob = __lsx_vmax_h(eob, eob0);
}
diff --git a/libvpx/vpx_dsp/loopfilter.c b/libvpx/vpx_dsp/loopfilter.c
index 995602831..d6504aab1 100644
--- a/libvpx/vpx_dsp/loopfilter.c
+++ b/libvpx/vpx_dsp/loopfilter.c
@@ -159,7 +159,7 @@ void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
}
-static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
+static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat,
uint8_t *op3, uint8_t *op2, uint8_t *op1,
uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
uint8_t *oq2, uint8_t *oq3) {
@@ -232,8 +232,8 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
}
-static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat,
- uint8_t flat2, uint8_t *op7, uint8_t *op6,
+static INLINE void filter16(int8_t mask, uint8_t thresh, int8_t flat,
+ int8_t flat2, uint8_t *op7, uint8_t *op6,
uint8_t *op5, uint8_t *op4, uint8_t *op3,
uint8_t *op2, uint8_t *op1, uint8_t *op0,
uint8_t *oq0, uint8_t *oq1, uint8_t *oq2,
@@ -505,7 +505,7 @@ void vpx_highbd_lpf_vertical_4_dual_c(
bd);
}
-static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
+static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat,
uint16_t *op3, uint16_t *op2, uint16_t *op1,
uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
uint16_t *oq2, uint16_t *oq3, int bd) {
@@ -584,8 +584,8 @@ void vpx_highbd_lpf_vertical_8_dual_c(
bd);
}
-static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat,
- uint8_t flat2, uint16_t *op7, uint16_t *op6,
+static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, int8_t flat,
+ int8_t flat2, uint16_t *op7, uint16_t *op6,
uint16_t *op5, uint16_t *op4, uint16_t *op3,
uint16_t *op2, uint16_t *op1, uint16_t *op0,
uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
diff --git a/libvpx/vpx_dsp/mips/macros_msa.h b/libvpx/vpx_dsp/mips/macros_msa.h
index 3c2f50c79..d54ce5368 100644
--- a/libvpx/vpx_dsp/mips/macros_msa.h
+++ b/libvpx/vpx_dsp/mips/macros_msa.h
@@ -83,31 +83,33 @@
val_lh_m; \
})
-#define LW(psrc) \
- ({ \
- const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \
- uint32_t val_lw_m; \
- \
- __asm__ __volatile__("lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
- "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \
- : [val_lw_m] "=&r"(val_lw_m) \
- : [psrc_lw_m] "r"(psrc_lw_m)); \
- \
- val_lw_m; \
+#define LW(psrc) \
+ ({ \
+ const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \
+ uint32_t val_lw_m; \
+ \
+ __asm__ __volatile__( \
+ "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
+ "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \
+ : [val_lw_m] "=&r"(val_lw_m) \
+ : [psrc_lw_m] "r"(psrc_lw_m)); \
+ \
+ val_lw_m; \
})
#if (__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \
- uint64_t val_ld_m = 0; \
- \
- __asm__ __volatile__("ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
- "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \
- : [val_ld_m] "=&r"(val_ld_m) \
- : [psrc_ld_m] "r"(psrc_ld_m)); \
- \
- val_ld_m; \
+#define LD(psrc) \
+ ({ \
+ const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \
+ uint64_t val_ld_m = 0; \
+ \
+ __asm__ __volatile__( \
+ "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
+ "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \
+ : [val_ld_m] "=&r"(val_ld_m) \
+ : [psrc_ld_m] "r"(psrc_ld_m)); \
+ \
+ val_ld_m; \
})
#else // !(__mips == 64)
#define LD(psrc) \
diff --git a/libvpx/vpx_dsp/ppc/quantize_vsx.c b/libvpx/vpx_dsp/ppc/quantize_vsx.c
index 7cdcbeb40..ab71f6e23 100644
--- a/libvpx/vpx_dsp/ppc/quantize_vsx.c
+++ b/libvpx/vpx_dsp/ppc/quantize_vsx.c
@@ -78,11 +78,10 @@ static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff,
return (int16x8_t)vec_perm(dqcoeffe, dqcoeffo, vec_perm_odd_even_pack);
}
-static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, bool16x8_t mask,
+static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff,
const int16_t *iscan_ptr, int index) {
int16x8_t scan = vec_vsx_ld(index, iscan_ptr);
bool16x8_t zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16);
- scan = vec_sub(scan, mask);
return vec_andc(scan, zero_coeff);
}
@@ -139,8 +138,8 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16);
vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr);
- eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0),
- nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16));
+ eob = vec_max(nonzero_scanindex(qcoeff0, iscan_ptr, 0),
+ nonzero_scanindex(qcoeff1, iscan_ptr, 16));
if (n_coeffs > 16) {
int index = 16;
@@ -177,10 +176,9 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr);
vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr);
- eob =
- vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0));
- eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1),
- nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2));
+ eob = vec_max(eob, nonzero_scanindex(qcoeff0, iscan_ptr, off0));
+ eob2 = vec_max(nonzero_scanindex(qcoeff1, iscan_ptr, off1),
+ nonzero_scanindex(qcoeff2, iscan_ptr, off2));
eob = vec_max(eob, eob2);
index += 24;
@@ -252,8 +250,8 @@ void vpx_quantize_b_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dequant = vec_splat(dequant, 1); // remove DC from dequant
vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), 16, dqcoeff_ptr);
- eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0),
- nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16));
+ eob = vec_max(nonzero_scanindex(qcoeff0, iscan_ptr, 0),
+ nonzero_scanindex(qcoeff1, iscan_ptr, 16));
do {
int16x8_t coeff2, coeff2_abs, qcoeff2, eob2;
@@ -286,9 +284,9 @@ void vpx_quantize_b_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), off1, dqcoeff_ptr);
vec_vsx_st(dequantize_coeff_32(qcoeff2, dequant), off2, dqcoeff_ptr);
- eob = vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0));
- eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1),
- nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2));
+ eob = vec_max(eob, nonzero_scanindex(qcoeff0, iscan_ptr, off0));
+ eob2 = vec_max(nonzero_scanindex(qcoeff1, iscan_ptr, off1),
+ nonzero_scanindex(qcoeff2, iscan_ptr, off2));
eob = vec_max(eob, eob2);
// 24 int16_t is 48 bytes
diff --git a/libvpx/vpx_dsp/psnr.c b/libvpx/vpx_dsp/psnr.c
index 48bac0450..f0d4e927a 100644
--- a/libvpx/vpx_dsp/psnr.c
+++ b/libvpx/vpx_dsp/psnr.c
@@ -26,57 +26,44 @@ double vpx_sse_to_psnr(double samples, double peak, double sse) {
/* TODO(yaowu): The block_variance calls the unoptimized versions of variance()
* and highbd_8_variance(). It should not.
*/
-static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int w, int h, unsigned int *sse,
- int *sum) {
+static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int w, int h) {
int i, j;
-
- *sum = 0;
- *sse = 0;
+ int64_t sse = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
const int diff = a[j] - b[j];
- *sum += diff;
- *sse += diff * diff;
+ sse += diff * diff;
}
a += a_stride;
b += b_stride;
}
+
+ return sse;
}
#if CONFIG_VP9_HIGHBITDEPTH
-static void encoder_highbd_variance64(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, int w,
- int h, uint64_t *sse, int64_t *sum) {
+static int64_t encoder_highbd_8_sse(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w,
+ int h) {
int i, j;
+ int64_t sse = 0;
uint16_t *a = CONVERT_TO_SHORTPTR(a8);
uint16_t *b = CONVERT_TO_SHORTPTR(b8);
- *sum = 0;
- *sse = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
const int diff = a[j] - b[j];
- *sum += diff;
- *sse += diff * diff;
+ sse += diff * diff;
}
a += a_stride;
b += b_stride;
}
-}
-static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, int w,
- int h, unsigned int *sse, int *sum) {
- uint64_t sse_long = 0;
- int64_t sum_long = 0;
- encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long,
- &sum_long);
- *sse = (unsigned int)sse_long;
- *sum = (int)sum_long;
+ return sse;
}
#endif // CONFIG_VP9_HIGHBITDEPTH
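
For context, the SSE totals accumulated by these helpers feed vpx_sse_to_psnr() declared above; a minimal sketch of the standard relation, assuming a non-zero SSE (the real function also guards the sse == 0 case):

#include <math.h>

/* PSNR in dB from total squared error over 'samples' pixels with peak value
 * 'peak' (illustrative sketch mirroring the signature of vpx_sse_to_psnr). */
static double sse_to_psnr_sketch(double samples, double peak, double sse) {
  return 10.0 * log10(samples * peak * peak / sse);
}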
@@ -85,26 +72,23 @@ static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
const int dw = width % 16;
const int dh = height % 16;
int64_t total_sse = 0;
- unsigned int sse = 0;
- int sum = 0;
int x, y;
if (dw > 0) {
- encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, dw,
- height, &sse, &sum);
- total_sse += sse;
+ total_sse += encoder_sse(&a[width - dw], a_stride, &b[width - dw], b_stride,
+ dw, height);
}
if (dh > 0) {
- encoder_variance(&a[(height - dh) * a_stride], a_stride,
- &b[(height - dh) * b_stride], b_stride, width - dw, dh,
- &sse, &sum);
- total_sse += sse;
+ total_sse +=
+ encoder_sse(&a[(height - dh) * a_stride], a_stride,
+ &b[(height - dh) * b_stride], b_stride, width - dw, dh);
}
for (y = 0; y < height / 16; ++y) {
const uint8_t *pa = a;
const uint8_t *pb = b;
+ unsigned int sse;
for (x = 0; x < width / 16; ++x) {
vpx_mse16x16(pa, a_stride, pb, b_stride, &sse);
total_sse += sse;
@@ -146,22 +130,19 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
int x, y;
const int dw = width % 16;
const int dh = height % 16;
- unsigned int sse = 0;
- int sum = 0;
if (dw > 0) {
- encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw],
- b_stride, dw, height, &sse, &sum);
- total_sse += sse;
+ total_sse += encoder_highbd_8_sse(&a[width - dw], a_stride, &b[width - dw],
+ b_stride, dw, height);
}
if (dh > 0) {
- encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
- &b[(height - dh) * b_stride], b_stride,
- width - dw, dh, &sse, &sum);
- total_sse += sse;
+ total_sse += encoder_highbd_8_sse(&a[(height - dh) * a_stride], a_stride,
+ &b[(height - dh) * b_stride], b_stride,
+ width - dw, dh);
}
for (y = 0; y < height / 16; ++y) {
const uint8_t *pa = a;
const uint8_t *pb = b;
+ unsigned int sse;
for (x = 0; x < width / 16; ++x) {
vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
total_sse += sse;
diff --git a/libvpx/vpx_dsp/variance.c b/libvpx/vpx_dsp/variance.c
index 30b55dcb4..ce1e8382b 100644
--- a/libvpx/vpx_dsp/variance.c
+++ b/libvpx/vpx_dsp/variance.c
@@ -549,9 +549,9 @@ HIGHBD_MSE(16, 8)
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)
-void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint16_t *pred,
- int width, int height, const uint16_t *ref,
- int ref_stride) {
+void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred,
+ int width, int height, const uint16_t *ref,
+ int ref_stride) {
int i, j;
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
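
The rename above follows libvpx's run-time CPU detection (RTCD) convention: the unsuffixed name is the dispatch symbol generated from vpx_dsp_rtcd_defs.pl, while _c, _sse2, _neon and so on are the concrete implementations. Giving the plain C version a _c suffix is what lets the NEON and SSE2 versions registered later in this patch (see the specialize line for vpx_highbd_comp_avg_pred further down) be selected at run time. A hedged sketch of the pattern follows; the real declarations live in the generated vpx_dsp_rtcd.h, so the exact shape here is illustrative:

    #include <stdint.h>

    /* Concrete implementations, one per ISA (prototype as in the hunk above). */
    void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred,
                                    int width, int height, const uint16_t *ref,
                                    int ref_stride);
    void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred,
                                       int width, int height, const uint16_t *ref,
                                       int ref_stride);

    /* On targets with run-time detection the dispatch symbol is a function
     * pointer set up in vpx_dsp_rtcd(); on fixed targets it is #defined
     * directly to one implementation. */
    extern void (*vpx_highbd_comp_avg_pred)(uint16_t *comp_pred,
                                            const uint16_t *pred, int width,
                                            int height, const uint16_t *ref,
                                            int ref_stride);
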
diff --git a/libvpx/vpx_dsp/vpx_dsp.mk b/libvpx/vpx_dsp/vpx_dsp.mk
index 13999af04..1fd9495cf 100644
--- a/libvpx/vpx_dsp/vpx_dsp.mk
+++ b/libvpx/vpx_dsp/vpx_dsp.mk
@@ -226,19 +226,19 @@ DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32x32_impl_sse2.h
ifeq ($(VPX_ARCH_X86_64),yes)
DSP_SRCS-$(HAVE_SSSE3) += x86/fwd_txfm_ssse3_x86_64.asm
endif
-DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h
-DSP_SRCS-$(HAVE_NEON) += arm/fdct_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/fdct4x4_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/fdct8x8_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/fdct16x16_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/fdct32x32_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/fdct_partial_neon.c
-DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h
DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c
DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.h
DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.c
ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c
DSP_SRCS-$(HAVE_MSA) += mips/fwd_dct32x32_msa.c
DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_dct32x32_lsx.c
endif # !CONFIG_VP9_HIGHBITDEPTH
@@ -326,11 +326,14 @@ DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.h
DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.c
DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.h
DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c
+DSP_SRCS-$(HAVE_AVX2) += x86/quantize_avx2.c
DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c
DSP_SRCS-$(HAVE_VSX) += ppc/quantize_vsx.c
DSP_SRCS-$(HAVE_LSX) += loongarch/quantize_lsx.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/highbd_quantize_intrin_avx2.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_quantize_neon.c
endif
# avg
@@ -374,6 +377,7 @@ DSP_SRCS-$(HAVE_MMI) += mips/subtract_mmi.c
DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/subtract_avx2.c
DSP_SRCS-$(HAVE_AVX512) += x86/sad4d_avx512.c
DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm
@@ -388,6 +392,9 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/subtract_lsx.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c
+DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad_avx2.c
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS
@@ -425,6 +432,7 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
diff --git a/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
index d3c668f9a..8725821b6 100644
--- a/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -527,6 +527,8 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_fdct4x4_1 sse2 neon/;
+ specialize qw/vpx_highbd_fdct4x4_1 neon/;
+ $vpx_highbd_fdct4x4_1_neon=vpx_fdct4x4_1_neon;
add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_fdct8x8 neon sse2/;
@@ -550,27 +552,29 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_fdct32x32_1 sse2 neon/;
add_proto qw/void vpx_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_highbd_fdct4x4 sse2/;
+ specialize qw/vpx_highbd_fdct4x4 sse2 neon/;
add_proto qw/void vpx_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_highbd_fdct8x8 sse2/;
+ specialize qw/vpx_highbd_fdct8x8 sse2 neon/;
add_proto qw/void vpx_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_highbd_fdct8x8_1 neon/;
$vpx_highbd_fdct8x8_1_neon=vpx_fdct8x8_1_neon;
add_proto qw/void vpx_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_highbd_fdct16x16 sse2/;
+ specialize qw/vpx_highbd_fdct16x16 sse2 neon/;
add_proto qw/void vpx_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_highbd_fdct16x16_1 neon/;
add_proto qw/void vpx_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_highbd_fdct32x32 sse2/;
+ specialize qw/vpx_highbd_fdct32x32 sse2 neon/;
add_proto qw/void vpx_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_highbd_fdct32x32_rd sse2/;
+ specialize qw/vpx_highbd_fdct32x32_rd sse2 neon/;
add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_highbd_fdct32x32_1 neon/;
} else {
add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_fdct4x4 neon sse2 msa lsx/;
@@ -711,17 +715,17 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
#
if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vpx_quantize_b neon sse2 ssse3 avx vsx lsx/;
+ specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/;
add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx lsx/;
+ specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vpx_highbd_quantize_b sse2/;
+ specialize qw/vpx_highbd_quantize_b neon sse2 avx2/;
add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vpx_highbd_quantize_b_32x32 sse2/;
+ specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/;
} # CONFIG_VP9_HIGHBITDEPTH
} # CONFIG_VP9_ENCODER
@@ -730,7 +734,7 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes") {
# Block subtraction
#
add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
-specialize qw/vpx_subtract_block neon msa mmi sse2 vsx lsx/;
+specialize qw/vpx_subtract_block neon msa mmi sse2 avx2 vsx lsx/;
#
# Single block SAD
@@ -795,7 +799,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx lsx/;
add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
- specialize qw/vpx_hadamard_32x32 sse2 avx2/;
+ specialize qw/vpx_hadamard_32x32 sse2 avx2 neon/;
add_proto qw/void vpx_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
specialize qw/vpx_highbd_hadamard_8x8 avx2/;
@@ -819,7 +823,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx lsx/;
add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
- specialize qw/vpx_hadamard_32x32 sse2 avx2/;
+ specialize qw/vpx_hadamard_32x32 sse2 avx2 neon/;
add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
specialize qw/vpx_satd avx2 sse2 neon msa/;
@@ -935,46 +939,49 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Block subtraction
#
add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd";
+ specialize qw/vpx_highbd_subtract_block neon avx2/;
#
# Single block SAD
#
add_proto qw/unsigned int vpx_highbd_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad64x64 sse2/;
+ specialize qw/vpx_highbd_sad64x64 sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad64x32 sse2/;
+ specialize qw/vpx_highbd_sad64x32 sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad32x64 sse2/;
+ specialize qw/vpx_highbd_sad32x64 sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad32x32 sse2/;
+ specialize qw/vpx_highbd_sad32x32 sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad32x16 sse2/;
+ specialize qw/vpx_highbd_sad32x16 sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad16x32 sse2/;
+ specialize qw/vpx_highbd_sad16x32 sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad16x16 sse2/;
+ specialize qw/vpx_highbd_sad16x16 sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad16x8 sse2/;
+ specialize qw/vpx_highbd_sad16x8 sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad8x16 sse2/;
+ specialize qw/vpx_highbd_sad8x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad8x8 sse2/;
+ specialize qw/vpx_highbd_sad8x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad8x4 sse2/;
+ specialize qw/vpx_highbd_sad8x4 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad4x8 neon/;
add_proto qw/unsigned int vpx_highbd_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad4x4 neon/;
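
For reference, the single-block SAD kernels listed above all compute the same quantity; the scalar semantics look like the sketch below (the library's prototypes take uint8_t pointers that are reinterpreted as 16-bit sample pointers internally, much like CONVERT_TO_SHORTPTR earlier in this patch; the sketch takes uint16_t directly for brevity). The _avg variants further down average a second predictor into the comparison, and the x4d variants produce four SADs against four reference candidates per call.

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar reference for a w x h high-bit-depth SAD (illustrative only;
     * the NEON/AVX2 kernels being wired up compute the same result). */
    static unsigned int highbd_sad(const uint16_t *src, int src_stride,
                                   const uint16_t *ref, int ref_stride,
                                   int w, int h) {
      unsigned int sad = 0;
      for (int i = 0; i < h; ++i) {
        for (int j = 0; j < w; ++j) sad += (unsigned int)abs(src[j] - ref[j]);
        src += src_stride;
        ref += ref_stride;
      }
      return sad;
    }
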
#
# Avg
@@ -988,83 +995,85 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max";
add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad64x64_avg sse2/;
+ specialize qw/vpx_highbd_sad64x64_avg sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad64x32_avg sse2/;
+ specialize qw/vpx_highbd_sad64x32_avg sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad32x64_avg sse2/;
+ specialize qw/vpx_highbd_sad32x64_avg sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad32x32_avg sse2/;
+ specialize qw/vpx_highbd_sad32x32_avg sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad32x16_avg sse2/;
+ specialize qw/vpx_highbd_sad32x16_avg sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad16x32_avg sse2/;
+ specialize qw/vpx_highbd_sad16x32_avg sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad16x16_avg sse2/;
+ specialize qw/vpx_highbd_sad16x16_avg sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad16x8_avg sse2/;
+ specialize qw/vpx_highbd_sad16x8_avg sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad8x16_avg sse2/;
+ specialize qw/vpx_highbd_sad8x16_avg sse2 neon/;
add_proto qw/unsigned int vpx_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad8x8_avg sse2/;
+ specialize qw/vpx_highbd_sad8x8_avg sse2 neon/;
add_proto qw/unsigned int vpx_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad8x4_avg sse2/;
+ specialize qw/vpx_highbd_sad8x4_avg sse2 neon/;
add_proto qw/unsigned int vpx_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad4x8_avg neon/;
add_proto qw/unsigned int vpx_highbd_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad4x4_avg neon/;
#
# Multi-block SAD, comparing a reference to N independent blocks
#
add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad64x64x4d sse2/;
+ specialize qw/vpx_highbd_sad64x64x4d sse2 neon avx2/;
add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad64x32x4d sse2/;
+ specialize qw/vpx_highbd_sad64x32x4d sse2 neon avx2/;
add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad32x64x4d sse2/;
+ specialize qw/vpx_highbd_sad32x64x4d sse2 neon avx2/;
add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad32x32x4d sse2/;
+ specialize qw/vpx_highbd_sad32x32x4d sse2 neon avx2/;
add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad32x16x4d sse2/;
+ specialize qw/vpx_highbd_sad32x16x4d sse2 neon avx2/;
add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad16x32x4d sse2/;
+ specialize qw/vpx_highbd_sad16x32x4d sse2 neon avx2/;
add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad16x16x4d sse2/;
+ specialize qw/vpx_highbd_sad16x16x4d sse2 neon avx2/;
add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad16x8x4d sse2/;
+ specialize qw/vpx_highbd_sad16x8x4d sse2 neon avx2/;
add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad8x16x4d sse2/;
+ specialize qw/vpx_highbd_sad8x16x4d sse2 neon/;
add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad8x8x4d sse2/;
+ specialize qw/vpx_highbd_sad8x8x4d sse2 neon/;
add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad8x4x4d sse2/;
+ specialize qw/vpx_highbd_sad8x4x4d sse2 neon/;
add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad4x8x4d sse2/;
+ specialize qw/vpx_highbd_sad4x8x4d sse2 neon/;
add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad4x4x4d sse2/;
+ specialize qw/vpx_highbd_sad4x4x4d sse2 neon/;
#
# Structured Similarity (SSIM)
@@ -1232,369 +1241,397 @@ add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, i
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance64x64 sse2/;
+ specialize qw/vpx_highbd_12_variance64x64 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance64x32 sse2/;
+ specialize qw/vpx_highbd_12_variance64x32 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance32x64 sse2/;
+ specialize qw/vpx_highbd_12_variance32x64 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance32x32 sse2/;
+ specialize qw/vpx_highbd_12_variance32x32 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance32x16 sse2/;
+ specialize qw/vpx_highbd_12_variance32x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance16x32 sse2/;
+ specialize qw/vpx_highbd_12_variance16x32 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance16x16 sse2/;
+ specialize qw/vpx_highbd_12_variance16x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance16x8 sse2/;
+ specialize qw/vpx_highbd_12_variance16x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance8x16 sse2/;
+ specialize qw/vpx_highbd_12_variance8x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance8x8 sse2/;
+ specialize qw/vpx_highbd_12_variance8x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance8x4 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance4x8 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance4x4 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance64x64 sse2/;
+ specialize qw/vpx_highbd_10_variance64x64 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance64x32 sse2/;
+ specialize qw/vpx_highbd_10_variance64x32 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance32x64 sse2/;
+ specialize qw/vpx_highbd_10_variance32x64 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance32x32 sse2/;
+ specialize qw/vpx_highbd_10_variance32x32 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance32x16 sse2/;
+ specialize qw/vpx_highbd_10_variance32x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance16x32 sse2/;
+ specialize qw/vpx_highbd_10_variance16x32 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance16x16 sse2/;
+ specialize qw/vpx_highbd_10_variance16x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance16x8 sse2/;
+ specialize qw/vpx_highbd_10_variance16x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance8x16 sse2/;
+ specialize qw/vpx_highbd_10_variance8x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance8x8 sse2/;
+ specialize qw/vpx_highbd_10_variance8x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance8x4 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance4x8 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance4x4 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance64x64 sse2/;
+ specialize qw/vpx_highbd_8_variance64x64 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance64x32 sse2/;
+ specialize qw/vpx_highbd_8_variance64x32 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance32x64 sse2/;
+ specialize qw/vpx_highbd_8_variance32x64 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance32x32 sse2/;
+ specialize qw/vpx_highbd_8_variance32x32 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance32x16 sse2/;
+ specialize qw/vpx_highbd_8_variance32x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance16x32 sse2/;
+ specialize qw/vpx_highbd_8_variance16x32 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance16x16 sse2/;
+ specialize qw/vpx_highbd_8_variance16x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance16x8 sse2/;
+ specialize qw/vpx_highbd_8_variance16x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance8x16 sse2/;
+ specialize qw/vpx_highbd_8_variance8x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance8x8 sse2/;
+ specialize qw/vpx_highbd_8_variance8x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance8x4 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance4x8 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance4x4 neon/;
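
Similarly, the variance kernels above share one definition: accumulate the sum and the sum of squares of the differences, then subtract the squared mean. A scalar sketch under the same caveat as before (uint16_t samples taken directly for brevity; the real prototypes pass uint8_t pointers that are converted internally):

    #include <stdint.h>

    /* variance = SSE - sum^2 / N, reported alongside SSE (illustrative only). */
    static unsigned int highbd_variance(const uint16_t *src, int src_stride,
                                        const uint16_t *ref, int ref_stride,
                                        int w, int h, unsigned int *sse) {
      int64_t sum = 0;
      uint64_t sse64 = 0;
      for (int i = 0; i < h; ++i) {
        for (int j = 0; j < w; ++j) {
          const int diff = src[j] - ref[j];
          sum += diff;
          sse64 += (uint64_t)(diff * diff);
        }
        src += src_stride;
        ref += ref_stride;
      }
      *sse = (unsigned int)sse64;
      return (unsigned int)(sse64 - (uint64_t)((sum * sum) / (w * h)));
    }
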
add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_8_get16x16var sse2/;
+ specialize qw/vpx_highbd_8_get16x16var sse2 neon/;
add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_8_get8x8var sse2/;
+ specialize qw/vpx_highbd_8_get8x8var sse2 neon/;
add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_10_get16x16var sse2/;
+ specialize qw/vpx_highbd_10_get16x16var sse2 neon/;
add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_10_get8x8var sse2/;
+ specialize qw/vpx_highbd_10_get8x8var sse2 neon/;
add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_12_get16x16var sse2/;
+ specialize qw/vpx_highbd_12_get16x16var sse2 neon/;
add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_12_get8x8var sse2/;
+ specialize qw/vpx_highbd_12_get8x8var sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_mse16x16 sse2/;
+ specialize qw/vpx_highbd_8_mse16x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_mse16x8 neon/;
add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_mse8x16 neon/;
add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_mse8x8 sse2/;
+ specialize qw/vpx_highbd_8_mse8x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_mse16x16 sse2/;
+ specialize qw/vpx_highbd_10_mse16x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_mse16x8 neon/;
add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_mse8x16 neon/;
add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_mse8x8 sse2/;
+ specialize qw/vpx_highbd_10_mse8x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_mse16x16 sse2/;
+ specialize qw/vpx_highbd_12_mse16x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_mse16x8 neon/;
add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_mse8x16 neon/;
add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_mse8x8 sse2/;
+ specialize qw/vpx_highbd_12_mse8x8 sse2 neon/;
add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride";
+ specialize qw/vpx_highbd_comp_avg_pred neon sse2/;
#
# Subpixel Variance
#
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance64x64 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance64x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance64x32 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance64x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance32x64 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance32x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance32x32 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance32x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance32x16 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance32x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance16x32 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance16x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance16x16 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance16x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance16x8 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance16x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance8x16 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance8x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance8x8 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance8x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance8x4 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance8x4 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance4x8 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance4x4 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance64x64 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance64x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance64x32 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance64x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance32x64 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance32x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance32x32 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance32x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance32x16 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance32x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance16x32 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance16x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance16x16 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance16x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance16x8 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance16x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance8x16 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance8x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance8x8 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance8x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance8x4 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance8x4 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance4x8 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance4x4 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance64x64 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance64x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance64x32 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance64x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance32x64 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance32x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance32x32 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance32x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance32x16 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance32x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance16x32 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance16x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance16x16 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance16x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance16x8 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance16x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance8x16 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance8x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance8x8 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance8x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance8x4 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance8x4 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance4x8 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance4x4 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x8 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x4 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x8 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x4 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x8 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x4 neon/;
} # CONFIG_VP9_HIGHBITDEPTH
diff --git a/libvpx/vpx_dsp/x86/avg_intrin_avx2.c b/libvpx/vpx_dsp/x86/avg_intrin_avx2.c
index 3f4f577a2..b2e01319d 100644
--- a/libvpx/vpx_dsp/x86/avg_intrin_avx2.c
+++ b/libvpx/vpx_dsp/x86/avg_intrin_avx2.c
@@ -104,7 +104,7 @@ void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
- src16[7] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[7] = _mm_loadu_si128((const __m128i *)(src_diff + src_stride));
src32[0] = _mm256_cvtepi16_epi32(src16[0]);
src32[1] = _mm256_cvtepi16_epi32(src16[1]);
@@ -304,7 +304,7 @@ static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
- src[7] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[7] = _mm256_loadu_si256((const __m256i *)(src_diff + src_stride));
hadamard_col8x2_avx2(src, 0);
hadamard_col8x2_avx2(src, 1);
diff --git a/libvpx/vpx_dsp/x86/avg_intrin_sse2.c b/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
index 9da2f34c9..015c11a1f 100644
--- a/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
+++ b/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
@@ -164,7 +164,7 @@ unsigned int vpx_highbd_avg_8x8_sse2(const uint8_t *s8, int p) {
s0 = _mm_add_epi32(s0, s1);
s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 8));
s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 4));
- avg = _mm_cvtsi128_si32(s0);
+ avg = (unsigned int)_mm_cvtsi128_si32(s0);
return (avg + 32) >> 6;
}
@@ -275,7 +275,7 @@ static INLINE void hadamard_8x8_sse2(const int16_t *src_diff,
src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
- src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride));
hadamard_col8_sse2(src, 0);
hadamard_col8_sse2(src, 1);
diff --git a/libvpx/vpx_dsp/x86/convolve_avx2.h b/libvpx/vpx_dsp/x86/convolve_avx2.h
index 99bc9637f..ebee964b1 100644
--- a/libvpx/vpx_dsp/x86/convolve_avx2.h
+++ b/libvpx/vpx_dsp/x86/convolve_avx2.h
@@ -129,9 +129,8 @@ static INLINE void mm256_storeu2_epi64(__m128i *const dst_ptr_1,
static INLINE void mm256_storeu2_epi32(__m128i *const dst_ptr_1,
__m128i *const dst_ptr_2,
const __m256i *const src) {
- *((uint32_t *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src));
- *((uint32_t *)(dst_ptr_2)) =
- _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1));
+ *((int *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src));
+ *((int *)(dst_ptr_2)) = _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1));
}
static INLINE __m256i mm256_round_epi32(const __m256i *const src,
diff --git a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
index 3f158b5e4..f3a802029 100644
--- a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
+++ b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
@@ -89,7 +89,7 @@ void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) {
const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64);
const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64);
const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING);
- const __m256i kZero = _mm256_set1_epi16(0);
+ const __m256i kZero = _mm256_setzero_si256();
const __m256i kOne = _mm256_set1_epi16(1);
// Do the two transform/transpose passes
int pass;
diff --git a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
index ac1246faa..bf350b6da 100644
--- a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
+++ b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
@@ -100,7 +100,7 @@ void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i kZero = _mm_setzero_si128();
const __m128i kOne = _mm_set1_epi16(1);
// Do the two transform/transpose passes
diff --git a/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h
index 78cf9111d..1d07391b0 100644
--- a/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h
+++ b/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h
@@ -249,7 +249,7 @@ static INLINE void highbd_idct16_4col_stage7(const __m128i *const in,
static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1,
const int bd) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
// Faster than _mm_set1_epi16((1 << bd) - 1).
const __m128i one = _mm_set1_epi16(1);
const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
diff --git a/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c b/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c
index d265fc1a9..9f45623de 100644
--- a/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c
+++ b/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c
@@ -18,7 +18,7 @@ static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
__m128i lbounded;
__m128i retval;
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi16(1);
__m128i t80, max, min;
@@ -51,7 +51,7 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int pitch,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh, int bd) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi16(1);
__m128i blimit_v, limit_v, thresh_v;
__m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
@@ -492,7 +492,7 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch,
DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
__m128i blimit_v, limit_v, thresh_v;
__m128i mask, hev, flat;
__m128i p3 = _mm_load_si128((__m128i *)(s - 4 * pitch));
@@ -720,7 +720,7 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh, int bd) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
__m128i blimit_v, limit_v, thresh_v;
__m128i mask, hev, flat;
__m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
diff --git a/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c
new file mode 100644
index 000000000..8edddd637
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+ const __m128i sign = _mm_srai_epi16(*p, 15);
+ const __m128i dc = _mm_unpacklo_epi16(*p, sign);
+ const __m128i ac = _mm_unpackhi_epi16(*p, sign);
+ *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+
+static VPX_FORCE_INLINE void update_qp(__m256i *qp) {
+ int i;
+ for (i = 0; i < 5; ++i) {
+ qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11);
+ }
+}
+
+static VPX_FORCE_INLINE void init_qp(const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *dequant_ptr,
+ const int16_t *quant_shift_ptr,
+ __m256i *qp, int log_scale) {
+ const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr);
+ const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+ const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+ const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+ const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr);
+ init_one_qp(&zbin, &qp[0]);
+ init_one_qp(&round, &qp[1]);
+ init_one_qp(&quant, &qp[2]);
+ init_one_qp(&dequant, &qp[3]);
+ init_one_qp(&quant_shift, &qp[4]);
+ if (log_scale > 0) {
+ const __m256i rnd = _mm256_set1_epi32((int16_t)(1 << (log_scale - 1)));
+ qp[0] = _mm256_add_epi32(qp[0], rnd);
+ qp[0] = _mm256_srai_epi32(qp[0], log_scale);
+
+ qp[1] = _mm256_add_epi32(qp[1], rnd);
+ qp[1] = _mm256_srai_epi32(qp[1], log_scale);
+ }
+ // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when
+ // calculating the zbin mask.
+ qp[0] = _mm256_sub_epi32(qp[0], _mm256_set1_epi32(1));
+}
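
For reference, a minimal scalar sketch of the zbin trick noted above (the helper name is illustrative, not part of libvpx): with 1 already subtracted, a single greater-than compare per lane stands in for the usual greater-or-equal zero-bin test.

#include <stdint.h>

/* Scalar sketch: comparing against (zbin - 1) with ">" is equivalent to
 * comparing against zbin with ">=", so no separate equality compare is
 * needed. */
static int passes_zbin(int32_t abs_coeff, int32_t zbin_minus_1) {
  return abs_coeff > zbin_minus_1;
}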
+
+// Note:
+// Multiplies each 32-bit lane of *x by the corresponding lane of *y and
+// right-shifts every 64-bit product by 16; the eight int32_t results are
+// returned.
+static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32(const __m256i *x,
+ const __m256i *y) {
+ __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+ __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+ const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+ const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+ prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+ prod_lo = _mm256_srli_epi64(prod_lo, 16);
+ prod_lo = _mm256_and_si256(prod_lo, mask);
+ prod_hi = _mm256_srli_epi64(prod_hi, 16);
+ prod_hi = _mm256_slli_epi64(prod_hi, 32);
+ return _mm256_or_si256(prod_lo, prod_hi);
+}
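
Per lane, the note above amounts to a widening multiply followed by a 16-bit right shift; a scalar sketch (illustrative helper, assuming the non-negative operands produced by the quantizer):

#include <stdint.h>

/* Scalar equivalent of one lane of mm256_mul_shift_epi32(): widen to 64
 * bits, multiply, shift right by 16, keep the low 32 bits. */
static int32_t mul_shift_16(int32_t x, int32_t y) {
  return (int32_t)(((int64_t)x * y) >> 16);
}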
+
+static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan_ptr,
+ __m256i eobmax,
+ __m256i nz_mask) {
+ const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask);
+ const __m256i packed_nz_mask_perm =
+ _mm256_permute4x64_epi64(packed_nz_mask, 0xD8);
+ const __m256i iscan =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr));
+ const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm);
+ return _mm256_max_epi16(eobmax, nz_iscan);
+}
+
+// Get the max eob from the lower 128 bits.
+static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob) {
+ __m256i eob_s;
+ eob_s = _mm256_shuffle_epi32(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 1);
+ eob = _mm256_max_epi16(eob, eob_s);
+#if defined(_MSC_VER) && (_MSC_VER < 1910)
+ return _mm_cvtsi128_si32(_mm256_extracti128_si256(eob, 0)) & 0xffff;
+#else
+ return (uint16_t)_mm256_extract_epi16(eob, 0);
+#endif
+}
+
+static VPX_FORCE_INLINE void quantize(const __m256i *qp,
+ const tran_low_t *coeff_ptr,
+ const int16_t *iscan_ptr,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ __m256i *eob) {
+ const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+ const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]);
+
+ if (_mm256_movemask_epi8(zbin_mask) == 0) {
+ const __m256i zero = _mm256_setzero_si256();
+ _mm256_storeu_si256((__m256i *)qcoeff, zero);
+ _mm256_storeu_si256((__m256i *)dqcoeff, zero);
+ return;
+ }
+ {
+ const __m256i tmp_rnd =
+ _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask);
+ const __m256i tmp = mm256_mul_shift_epi32(&tmp_rnd, &qp[2]);
+ const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd);
+ const __m256i abs_q = mm256_mul_shift_epi32(&tmp2, &qp[4]);
+ const __m256i abs_dq = _mm256_mullo_epi32(abs_q, qp[3]);
+ const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+ const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+ const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+
+ _mm256_storeu_si256((__m256i *)qcoeff, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+ *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+ }
+}
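
A scalar sketch of the per-coefficient path that quantize() vectorizes; the helper and parameter names are illustrative, with qp[] mapping to zbin (minus 1), round, quant, dequant and quant_shift as set up in init_qp():

#include <stdint.h>

static void quantize_one(int32_t coeff, int32_t zbin_minus_1, int32_t round,
                         int32_t quant, int32_t quant_shift, int32_t dequant,
                         int32_t *qcoeff, int32_t *dqcoeff) {
  const int32_t abs_coeff = coeff < 0 ? -coeff : coeff;
  if (abs_coeff > zbin_minus_1) {            /* zbin mask */
    const int64_t tmp = abs_coeff + round;   /* qp[1] */
    const int64_t tmp2 = ((tmp * quant) >> 16) + tmp;             /* qp[2] */
    const int32_t abs_q = (int32_t)((tmp2 * quant_shift) >> 16);  /* qp[4] */
    *qcoeff = coeff < 0 ? -abs_q : abs_q;
    *dqcoeff = *qcoeff * dequant;            /* qp[3] */
  } else {
    *qcoeff = 0;
    *dqcoeff = 0;
  }
}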
+
+void vpx_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const int step = 8;
+ __m256i eob = _mm256_setzero_si256();
+ __m256i qp[5];
+ (void)scan;
+
+ init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 0);
+
+ quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+
+ update_qp(qp);
+
+ while (n_coeffs > 0) {
+ quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+ }
+
+ *eob_ptr = get_max_eob(eob);
+}
+
+static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x,
+ const __m256i *y,
+ int log_scale) {
+ __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+ __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+ const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+ const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+ prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+ prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale);
+ prod_lo = _mm256_and_si256(prod_lo, mask);
+ prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale);
+ prod_hi = _mm256_slli_epi64(prod_hi, 32);
+ return _mm256_or_si256(prod_lo, prod_hi);
+}
+
+static VPX_FORCE_INLINE void quantize_b_32x32(
+ const __m256i *qp, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) {
+ const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+ const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]);
+
+ if (_mm256_movemask_epi8(zbin_mask) == 0) {
+ const __m256i zero = _mm256_setzero_si256();
+ _mm256_storeu_si256((__m256i *)qcoeff, zero);
+ _mm256_storeu_si256((__m256i *)dqcoeff, zero);
+ return;
+ }
+
+ {
+ const __m256i tmp_rnd =
+ _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask);
+ // const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+ const __m256i tmp = mm256_mul_shift_epi32_logscale(&tmp_rnd, &qp[2], 0);
+ const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd);
+ // const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+ const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp2, &qp[4], 1);
+ const __m256i abs_dq =
+ _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, qp[3]), 1);
+ const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+ const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+ const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+
+ _mm256_storeu_si256((__m256i *)qcoeff, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+ *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+ }
+}
+
+void vpx_highbd_quantize_b_32x32_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const unsigned int step = 8;
+ __m256i eob = _mm256_setzero_si256();
+ __m256i qp[5];
+ (void)scan;
+
+ init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 1);
+
+ quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+
+ update_qp(qp);
+
+ while (n_coeffs > 0) {
+ quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+ }
+
+ *eob_ptr = get_max_eob(eob);
+}
diff --git a/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
index 4535a0f7a..ae1981a83 100644
--- a/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -25,7 +25,7 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
+ int i, j, non_zero_regs = (int)count / 4, eob_i = 0;
__m128i zbins[2];
__m128i nzbins[2];
@@ -82,13 +82,14 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
const uint32_t abs_qcoeff =
(uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
- qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
+ qcoeff_ptr[k] =
+ (int)(abs_qcoeff ^ (uint32_t)coeff_sign[j]) - coeff_sign[j];
dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
}
}
}
- *eob_ptr = eob_i + 1;
+ *eob_ptr = eob_i;
}
void vpx_highbd_quantize_b_32x32_sse2(
@@ -101,7 +102,7 @@ void vpx_highbd_quantize_b_32x32_sse2(
__m128i nzbins[2];
int idx = 0;
int idx_arr[1024];
- int i, eob = -1;
+ int i, eob = 0;
const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
(void)scan;
@@ -143,10 +144,10 @@ void vpx_highbd_quantize_b_32x32_sse2(
const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
const uint32_t abs_qcoeff =
(uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
- qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
+ qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
}
- *eob_ptr = eob + 1;
+ *eob_ptr = eob;
}
#endif
diff --git a/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c b/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c
new file mode 100644
index 000000000..947b5e977
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c
@@ -0,0 +1,401 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h> // AVX2
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static VPX_FORCE_INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
+ uint32_t sad_array[4]) {
+ const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
+ const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
+ const __m256i t2 = _mm256_hadd_epi32(t0, t1);
+ const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),
+ _mm256_extractf128_si256(t2, 1));
+ _mm_storeu_si128((__m128i *)sad_array, sum);
+}
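
calc_final_4() reduces four 8-lane 32-bit accumulators to the four SAD outputs; a scalar sketch of the same reduction (illustrative helper, assuming the lane values have already been widened to 32 bits):

#include <stdint.h>

static void calc_final_4_scalar(const uint32_t sums[4][8],
                                uint32_t sad_array[4]) {
  int k, lane;
  for (k = 0; k < 4; ++k) {
    uint32_t total = 0;
    for (lane = 0; lane < 8; ++lane) total += sums[k][lane];
    sad_array[k] = total;
  }
}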
+
+static VPX_FORCE_INLINE void highbd_sad64xHx4d(__m256i *sums_16 /*[4]*/,
+ const uint16_t *src,
+ int src_stride,
+ uint16_t *refs[4],
+ int ref_stride, int height) {
+ int i;
+ for (i = 0; i < height; ++i) {
+ // load src and all ref[]
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+ const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
+ const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
+ int x;
+
+ for (x = 0; x < 4; ++x) {
+ __m256i r[4];
+ r[0] = _mm256_loadu_si256((const __m256i *)refs[x]);
+ r[1] = _mm256_loadu_si256((const __m256i *)(refs[x] + 16));
+ r[2] = _mm256_loadu_si256((const __m256i *)(refs[x] + 32));
+ r[3] = _mm256_loadu_si256((const __m256i *)(refs[x] + 48));
+
+      // absolute differences between each ref[] and src
+ r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s0));
+ r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s1));
+ r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s2));
+ r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s3));
+
+ // sum every abs diff
+ sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[0], r[1]));
+ sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[2], r[3]));
+ }
+
+ src += src_stride;
+ refs[0] += ref_stride;
+ refs[1] += ref_stride;
+ refs[2] += ref_stride;
+ refs[3] += ref_stride;
+ }
+}
+
+#define HIGHBD_SAD64XNX4D(n) \
+ void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *refs[4]; \
+ __m256i sums_16[4]; \
+ __m256i sums_32[4]; \
+ int i; \
+ \
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \
+ sums_32[0] = _mm256_setzero_si256(); \
+ sums_32[1] = _mm256_setzero_si256(); \
+ sums_32[2] = _mm256_setzero_si256(); \
+ sums_32[3] = _mm256_setzero_si256(); \
+ \
+ for (i = 0; i < (n / 2); ++i) { \
+ sums_16[0] = _mm256_setzero_si256(); \
+ sums_16[1] = _mm256_setzero_si256(); \
+ sums_16[2] = _mm256_setzero_si256(); \
+ sums_16[3] = _mm256_setzero_si256(); \
+ \
+ highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2); \
+ \
+      /* sums_16 will overflow after 2 rows, so add current sums_16 to \
+ * sums_32*/ \
+ sums_32[0] = _mm256_add_epi32( \
+ sums_32[0], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[0], 1)))); \
+ sums_32[1] = _mm256_add_epi32( \
+ sums_32[1], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[1], 1)))); \
+ sums_32[2] = _mm256_add_epi32( \
+ sums_32[2], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[2], 1)))); \
+ sums_32[3] = _mm256_add_epi32( \
+ sums_32[3], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[3], 1)))); \
+ \
+ src += src_stride << 1; \
+ } \
+ calc_final_4(sums_32, sad_array); \
+ }
+
+// 64x64
+HIGHBD_SAD64XNX4D(64)
+
+// 64x32
+HIGHBD_SAD64XNX4D(32)
+
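
The 2-row flush in the macro above keeps the 16-bit accumulators safe: with 12-bit input each absolute difference is at most 4095, and for a 64-wide block every 16-bit lane gathers four differences per row, so two rows contribute at most 4 * 4095 * 2 = 32760 before the partial sums are widened to 32 bits. A scalar sketch of what each of these x4d kernels computes (illustrative helper, written for an arbitrary width and height):

#include <stdint.h>
#include <stdlib.h>

static void highbd_sad4d_scalar(const uint16_t *src, int src_stride,
                                const uint16_t *const refs[4], int ref_stride,
                                int width, int height, uint32_t sad_array[4]) {
  int k, r, c;
  for (k = 0; k < 4; ++k) {
    uint32_t total = 0;
    for (r = 0; r < height; ++r) {
      for (c = 0; c < width; ++c) {
        total += (uint32_t)abs(src[r * src_stride + c] -
                               refs[k][r * ref_stride + c]);
      }
    }
    sad_array[k] = total;
  }
}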
+static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/,
+ const uint16_t *src,
+ int src_stride,
+ uint16_t *refs[4],
+ int ref_stride, int height) {
+ int i;
+ for (i = 0; i < height; i++) {
+ __m256i r[8];
+
+ // load src and all ref[]
+ const __m256i s = _mm256_load_si256((const __m256i *)src);
+ const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 16));
+ r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
+ r[1] = _mm256_loadu_si256((const __m256i *)(refs[0] + 16));
+ r[2] = _mm256_loadu_si256((const __m256i *)refs[1]);
+ r[3] = _mm256_loadu_si256((const __m256i *)(refs[1] + 16));
+ r[4] = _mm256_loadu_si256((const __m256i *)refs[2]);
+ r[5] = _mm256_loadu_si256((const __m256i *)(refs[2] + 16));
+ r[6] = _mm256_loadu_si256((const __m256i *)refs[3]);
+ r[7] = _mm256_loadu_si256((const __m256i *)(refs[3] + 16));
+
+    // absolute differences between each ref[] and src
+ r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s));
+ r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s2));
+ r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s));
+ r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s2));
+ r[4] = _mm256_abs_epi16(_mm256_sub_epi16(r[4], s));
+ r[5] = _mm256_abs_epi16(_mm256_sub_epi16(r[5], s2));
+ r[6] = _mm256_abs_epi16(_mm256_sub_epi16(r[6], s));
+ r[7] = _mm256_abs_epi16(_mm256_sub_epi16(r[7], s2));
+
+ // sum every abs diff
+ sums_16[0] = _mm256_add_epi16(sums_16[0], _mm256_add_epi16(r[0], r[1]));
+ sums_16[1] = _mm256_add_epi16(sums_16[1], _mm256_add_epi16(r[2], r[3]));
+ sums_16[2] = _mm256_add_epi16(sums_16[2], _mm256_add_epi16(r[4], r[5]));
+ sums_16[3] = _mm256_add_epi16(sums_16[3], _mm256_add_epi16(r[6], r[7]));
+
+ src += src_stride;
+ refs[0] += ref_stride;
+ refs[1] += ref_stride;
+ refs[2] += ref_stride;
+ refs[3] += ref_stride;
+ }
+}
+
+#define HIGHBD_SAD32XNX4D(n) \
+ void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *refs[4]; \
+ __m256i sums_16[4]; \
+ __m256i sums_32[4]; \
+ int i; \
+ \
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \
+ sums_32[0] = _mm256_setzero_si256(); \
+ sums_32[1] = _mm256_setzero_si256(); \
+ sums_32[2] = _mm256_setzero_si256(); \
+ sums_32[3] = _mm256_setzero_si256(); \
+ \
+ for (i = 0; i < (n / 8); ++i) { \
+ sums_16[0] = _mm256_setzero_si256(); \
+ sums_16[1] = _mm256_setzero_si256(); \
+ sums_16[2] = _mm256_setzero_si256(); \
+ sums_16[3] = _mm256_setzero_si256(); \
+ \
+ highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); \
+ \
+      /* sums_16 will overflow after 8 rows, so add current sums_16 to \
+ * sums_32*/ \
+ sums_32[0] = _mm256_add_epi32( \
+ sums_32[0], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[0], 1)))); \
+ sums_32[1] = _mm256_add_epi32( \
+ sums_32[1], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[1], 1)))); \
+ sums_32[2] = _mm256_add_epi32( \
+ sums_32[2], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[2], 1)))); \
+ sums_32[3] = _mm256_add_epi32( \
+ sums_32[3], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[3], 1)))); \
+ \
+ src += src_stride << 3; \
+ } \
+ calc_final_4(sums_32, sad_array); \
+ }
+
+// 32x64
+HIGHBD_SAD32XNX4D(64)
+
+// 32x32
+HIGHBD_SAD32XNX4D(32)
+
+// 32x16
+HIGHBD_SAD32XNX4D(16)
+
+static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/,
+ const uint16_t *src,
+ int src_stride,
+ uint16_t *refs[4],
+ int ref_stride, int height) {
+ int i;
+ for (i = 0; i < height; i++) {
+ __m256i r[4];
+
+ // load src and all ref[]
+ const __m256i s = _mm256_load_si256((const __m256i *)src);
+ r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
+ r[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
+ r[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
+ r[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
+
+    // absolute differences between each ref[] and src
+ r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s));
+ r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s));
+ r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s));
+ r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s));
+
+ // sum every abs diff
+ sums_16[0] = _mm256_add_epi16(sums_16[0], r[0]);
+ sums_16[1] = _mm256_add_epi16(sums_16[1], r[1]);
+ sums_16[2] = _mm256_add_epi16(sums_16[2], r[2]);
+ sums_16[3] = _mm256_add_epi16(sums_16[3], r[3]);
+
+ src += src_stride;
+ refs[0] += ref_stride;
+ refs[1] += ref_stride;
+ refs[2] += ref_stride;
+ refs[3] += ref_stride;
+ }
+}
+
+void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4],
+ int ref_stride, uint32_t sad_array[4]) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *refs[4];
+ __m256i sums_16[4];
+ __m256i sums_32[4];
+ int i;
+
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+ sums_32[0] = _mm256_setzero_si256();
+ sums_32[1] = _mm256_setzero_si256();
+ sums_32[2] = _mm256_setzero_si256();
+ sums_32[3] = _mm256_setzero_si256();
+
+ for (i = 0; i < 2; ++i) {
+ sums_16[0] = _mm256_setzero_si256();
+ sums_16[1] = _mm256_setzero_si256();
+ sums_16[2] = _mm256_setzero_si256();
+ sums_16[3] = _mm256_setzero_si256();
+
+ highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16);
+
+    // sums_16 will overflow after 16 rows, so add current sums_16 to sums_32
+ sums_32[0] = _mm256_add_epi32(
+ sums_32[0],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))));
+ sums_32[1] = _mm256_add_epi32(
+ sums_32[1],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))));
+ sums_32[2] = _mm256_add_epi32(
+ sums_32[2],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))));
+ sums_32[3] = _mm256_add_epi32(
+ sums_32[3],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))));
+
+ src += src_stride << 4;
+ }
+ calc_final_4(sums_32, sad_array);
+}
+
+void vpx_highbd_sad16x16x4d_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4],
+ int ref_stride, uint32_t sad_array[4]) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *refs[4];
+ __m256i sums_16[4];
+
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+ sums_16[0] = _mm256_setzero_si256();
+ sums_16[1] = _mm256_setzero_si256();
+ sums_16[2] = _mm256_setzero_si256();
+ sums_16[3] = _mm256_setzero_si256();
+
+ highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16);
+
+ {
+ __m256i sums_32[4];
+ sums_32[0] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)));
+ sums_32[1] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)));
+ sums_32[2] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)));
+ sums_32[3] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)));
+ calc_final_4(sums_32, sad_array);
+ }
+}
+
+void vpx_highbd_sad16x8x4d_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4],
+ int ref_stride, uint32_t sad_array[4]) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *refs[4];
+ __m256i sums_16[4];
+
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+ sums_16[0] = _mm256_setzero_si256();
+ sums_16[1] = _mm256_setzero_si256();
+ sums_16[2] = _mm256_setzero_si256();
+ sums_16[3] = _mm256_setzero_si256();
+
+ highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 8);
+
+ {
+ __m256i sums_32[4];
+ sums_32[0] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)));
+ sums_32[1] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)));
+ sums_32[2] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)));
+ sums_32[3] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)));
+ calc_final_4(sums_32, sad_array);
+ }
+}
diff --git a/libvpx/vpx_dsp/x86/highbd_sad_avx2.c b/libvpx/vpx_dsp/x86/highbd_sad_avx2.c
new file mode 100644
index 000000000..231b67f80
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/highbd_sad_avx2.c
@@ -0,0 +1,468 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) {
+ const __m256i t0 = _mm256_add_epi32(sums_32, _mm256_srli_si256(sums_32, 8));
+ const __m256i t1 = _mm256_add_epi32(t0, _mm256_srli_si256(t0, 4));
+ const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t1),
+ _mm256_extractf128_si256(t1, 1));
+ return (unsigned int)_mm_cvtsi128_si32(sum);
+}
+
+static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16,
+ const uint16_t *src, int src_stride,
+ uint16_t *ref, int ref_stride,
+ int height) {
+ int i;
+ for (i = 0; i < height; ++i) {
+    // load src and ref
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+ const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
+ const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+ const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+ const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
+    // absolute differences between ref and src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
+ const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(r2, s2));
+ const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(r3, s3));
+ // sum every abs diff
+ *sums_16 =
+ _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1));
+ *sums_16 =
+ _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3));
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+#define HIGHBD_SAD64XN(n) \
+ unsigned int vpx_highbd_sad64x##n##_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ __m256i sums_32 = _mm256_setzero_si256(); \
+ int i; \
+ \
+ for (i = 0; i < (n / 2); ++i) { \
+ __m256i sums_16 = _mm256_setzero_si256(); \
+ \
+ highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); \
+ \
+      /* sums_16 will overflow after 2 rows, so add current sums_16 to \
+ * sums_32*/ \
+ sums_32 = _mm256_add_epi32( \
+ sums_32, \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \
+ \
+ src += src_stride << 1; \
+ ref += ref_stride << 1; \
+ } \
+ return calc_final(sums_32); \
+ }
+
+// 64x64
+HIGHBD_SAD64XN(64)
+
+// 64x32
+HIGHBD_SAD64XN(32)
+
+static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16,
+ const uint16_t *src, int src_stride,
+ uint16_t *ref, int ref_stride,
+ int height) {
+ int i;
+ for (i = 0; i < height; ++i) {
+    // load src and ref
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+    // absolute differences between ref and src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
+ // sum every abs diff
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+#define HIGHBD_SAD32XN(n) \
+ unsigned int vpx_highbd_sad32x##n##_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ __m256i sums_32 = _mm256_setzero_si256(); \
+ int i; \
+ \
+ for (i = 0; i < (n / 8); ++i) { \
+ __m256i sums_16 = _mm256_setzero_si256(); \
+ \
+ highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); \
+ \
+      /* sums_16 will overflow after 8 rows, so add current sums_16 to \
+ * sums_32*/ \
+ sums_32 = _mm256_add_epi32( \
+ sums_32, \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \
+ \
+ src += src_stride << 3; \
+ ref += ref_stride << 3; \
+ } \
+ return calc_final(sums_32); \
+ }
+
+// 32x64
+HIGHBD_SAD32XN(64)
+
+// 32x32
+HIGHBD_SAD32XN(32)
+
+// 32x16
+HIGHBD_SAD32XN(16)
+
+static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16,
+ const uint16_t *src, int src_stride,
+ uint16_t *ref, int ref_stride,
+ int height) {
+ int i;
+ for (i = 0; i < height; i += 2) {
+    // load src and ref
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
+    // absolute differences between ref and src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
+ // sum every abs diff
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+ src += src_stride << 1;
+ ref += ref_stride << 1;
+ }
+}
+
+unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ __m256i sums_32 = _mm256_setzero_si256();
+ int i;
+
+ for (i = 0; i < 2; ++i) {
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16);
+
+    // sums_16 will overflow after 16 rows, so add current sums_16 to sums_32
+ sums_32 = _mm256_add_epi32(
+ sums_32,
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));
+
+ src += src_stride << 4;
+ ref += ref_stride << 4;
+ }
+ return calc_final(sums_32);
+}
+
+unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16);
+
+ {
+ const __m256i sums_32 = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+ return calc_final(sums_32);
+ }
+}
+
+unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 8);
+
+ {
+ const __m256i sums_32 = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+ return calc_final(sums_32);
+ }
+}
+
+// AVG -------------------------------------------------------------------------
+static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16,
+ const uint16_t *src,
+ int src_stride, uint16_t *ref,
+ int ref_stride, uint16_t *sec,
+ int height) {
+ int i;
+ for (i = 0; i < height; ++i) {
+    // load src, ref, and second pred
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+ const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
+ const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+ const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+ const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
+ const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
+ const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
+ const __m256i x2 = _mm256_loadu_si256((const __m256i *)(sec + 32));
+ const __m256i x3 = _mm256_loadu_si256((const __m256i *)(sec + 48));
+ const __m256i avg0 = _mm256_avg_epu16(r0, x0);
+ const __m256i avg1 = _mm256_avg_epu16(r1, x1);
+ const __m256i avg2 = _mm256_avg_epu16(r2, x2);
+ const __m256i avg3 = _mm256_avg_epu16(r3, x3);
+    // absolute differences between each ref/pred avg and src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
+ const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(avg2, s2));
+ const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(avg3, s3));
+ // sum every abs diff
+ *sums_16 =
+ _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1));
+ *sums_16 =
+ _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3));
+
+ src += src_stride;
+ ref += ref_stride;
+ sec += 64;
+ }
+}
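
In these _avg kernels the reference is first combined with the second predictor using the same round-to-nearest rule as _mm256_avg_epu16(); per sample that is (a sketch, helper name illustrative):

#include <stdint.h>

/* Rounding average applied to ref and second_pred before the SAD. */
static uint16_t avg_round(uint16_t ref, uint16_t second_pred) {
  return (uint16_t)((ref + second_pred + 1) >> 1);
}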
+
+#define HIGHBD_SAD64XN_AVG(n) \
+ unsigned int vpx_highbd_sad64x##n##_avg_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); \
+ __m256i sums_32 = _mm256_setzero_si256(); \
+ int i; \
+ \
+ for (i = 0; i < (n / 2); ++i) { \
+ __m256i sums_16 = _mm256_setzero_si256(); \
+ \
+ highbd_sad64xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 2); \
+ \
+      /* sums_16 will overflow after 2 rows, so add current sums_16 to \
+ * sums_32*/ \
+ sums_32 = _mm256_add_epi32( \
+ sums_32, \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \
+ \
+ src += src_stride << 1; \
+ ref += ref_stride << 1; \
+ sec += 64 << 1; \
+ } \
+ return calc_final(sums_32); \
+ }
+
+// 64x64
+HIGHBD_SAD64XN_AVG(64)
+
+// 64x32
+HIGHBD_SAD64XN_AVG(32)
+
+static VPX_FORCE_INLINE void highbd_sad32xH_avg(__m256i *sums_16,
+ const uint16_t *src,
+ int src_stride, uint16_t *ref,
+ int ref_stride, uint16_t *sec,
+ int height) {
+ int i;
+ for (i = 0; i < height; ++i) {
+    // load src, ref, and second pred
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+ const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
+ const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
+ const __m256i avg0 = _mm256_avg_epu16(r0, x0);
+ const __m256i avg1 = _mm256_avg_epu16(r1, x1);
+    // absolute differences between each ref/pred avg and src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
+ // sum every abs diff
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+ src += src_stride;
+ ref += ref_stride;
+ sec += 32;
+ }
+}
+
+#define HIGHBD_SAD32XN_AVG(n) \
+ unsigned int vpx_highbd_sad32x##n##_avg_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); \
+ __m256i sums_32 = _mm256_setzero_si256(); \
+ int i; \
+ \
+ for (i = 0; i < (n / 8); ++i) { \
+ __m256i sums_16 = _mm256_setzero_si256(); \
+ \
+ highbd_sad32xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8); \
+ \
+      /* sums_16 will overflow after 8 rows, so add current sums_16 to \
+ * sums_32*/ \
+ sums_32 = _mm256_add_epi32( \
+ sums_32, \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \
+ \
+ src += src_stride << 3; \
+ ref += ref_stride << 3; \
+ sec += 32 << 3; \
+ } \
+ return calc_final(sums_32); \
+ }
+
+// 32x64
+HIGHBD_SAD32XN_AVG(64)
+
+// 32x32
+HIGHBD_SAD32XN_AVG(32)
+
+// 32x16
+HIGHBD_SAD32XN_AVG(16)
+
+static VPX_FORCE_INLINE void highbd_sad16xH_avg(__m256i *sums_16,
+ const uint16_t *src,
+ int src_stride, uint16_t *ref,
+ int ref_stride, uint16_t *sec,
+ int height) {
+ int i;
+ for (i = 0; i < height; i += 2) {
+    // load src, ref, and second pred
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
+ const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
+ const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
+ const __m256i avg0 = _mm256_avg_epu16(r0, x0);
+ const __m256i avg1 = _mm256_avg_epu16(r1, x1);
+    // absolute differences between each ref/pred avg and src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
+ // sum every abs diff
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+ src += src_stride << 1;
+ ref += ref_stride << 1;
+ sec += 32;
+ }
+}
+
+unsigned int vpx_highbd_sad16x32_avg_avx2(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride,
+ const uint8_t *second_pred) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
+ __m256i sums_32 = _mm256_setzero_si256();
+ int i;
+
+ for (i = 0; i < 2; ++i) {
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16);
+
+    // sums_16 will overflow after 16 rows, so add current sums_16 to sums_32
+ sums_32 = _mm256_add_epi32(
+ sums_32,
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));
+
+ src += src_stride << 4;
+ ref += ref_stride << 4;
+ sec += 16 << 4;
+ }
+ return calc_final(sums_32);
+}
+
+unsigned int vpx_highbd_sad16x16_avg_avx2(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride,
+ const uint8_t *second_pred) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16);
+
+ {
+ const __m256i sums_32 = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+ return calc_final(sums_32);
+ }
+}
+
+unsigned int vpx_highbd_sad16x8_avg_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8);
+
+ {
+ const __m256i sums_32 = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+ return calc_final(sums_32);
+ }
+}
diff --git a/libvpx/vpx_dsp/x86/highbd_variance_sse2.c b/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
index 7c8d79b09..381e0ad19 100644
--- a/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
@@ -7,6 +7,7 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <emmintrin.h> // SSE2
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
@@ -559,3 +560,49 @@ FNS(sse2)
#undef FNS
#undef FN
+
+void vpx_highbd_comp_avg_pred_sse2(uint16_t *comp_pred, const uint16_t *pred,
+ int width, int height, const uint16_t *ref,
+ int ref_stride) {
+ int i, j;
+ if (width > 8) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; j += 16) {
+ const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[j]);
+ const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[j + 8]);
+ const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[j]);
+ const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[j + 8]);
+ _mm_storeu_si128((__m128i *)&comp_pred[j], _mm_avg_epu16(p0, r0));
+ _mm_storeu_si128((__m128i *)&comp_pred[j + 8], _mm_avg_epu16(p1, r1));
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+ } else if (width == 8) {
+ for (i = 0; i < height; i += 2) {
+ const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[0]);
+ const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[8]);
+ const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[0]);
+ const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[ref_stride]);
+ _mm_storeu_si128((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0));
+ _mm_storeu_si128((__m128i *)&comp_pred[8], _mm_avg_epu16(p1, r1));
+ comp_pred += 8 << 1;
+ pred += 8 << 1;
+ ref += ref_stride << 1;
+ }
+ } else {
+ assert(width == 4);
+ for (i = 0; i < height; i += 2) {
+ const __m128i p0 = _mm_loadl_epi64((const __m128i *)&pred[0]);
+ const __m128i p1 = _mm_loadl_epi64((const __m128i *)&pred[4]);
+ const __m128i r0 = _mm_loadl_epi64((const __m128i *)&ref[0]);
+ const __m128i r1 = _mm_loadl_epi64((const __m128i *)&ref[ref_stride]);
+ _mm_storel_epi64((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0));
+ _mm_storel_epi64((__m128i *)&comp_pred[4], _mm_avg_epu16(p1, r1));
+ comp_pred += 4 << 1;
+ pred += 4 << 1;
+ ref += ref_stride << 1;
+ }
+ }
+}
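
The SSE2 routine above vectorizes this per-sample rule; a scalar sketch with illustrative names, using the same rounded average and the same pointer advances:

#include <stdint.h>

static void highbd_comp_avg_pred_scalar(uint16_t *comp_pred,
                                        const uint16_t *pred, int width,
                                        int height, const uint16_t *ref,
                                        int ref_stride) {
  int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      comp_pred[j] = (uint16_t)((pred[j] + ref[j] + 1) >> 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}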
diff --git a/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
index 4b02da966..f42b3df84 100644
--- a/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
@@ -243,7 +243,7 @@ void iadst8_sse2(__m128i *const in) {
const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
- const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i kZero = _mm_setzero_si128();
__m128i s[8], u[16], v[8], w[16];
// transpose
@@ -546,7 +546,7 @@ void vpx_iadst16_8col_sse2(__m128i *const in) {
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i kZero = _mm_setzero_si128();
u[0] = _mm_unpacklo_epi16(in[15], in[0]);
u[1] = _mm_unpackhi_epi16(in[15], in[0]);
diff --git a/libvpx/vpx_dsp/x86/loopfilter_avx2.c b/libvpx/vpx_dsp/x86/loopfilter_avx2.c
index be391992a..a58fb6553 100644
--- a/libvpx/vpx_dsp/x86/loopfilter_avx2.c
+++ b/libvpx/vpx_dsp/x86/loopfilter_avx2.c
@@ -18,7 +18,7 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int pitch,
const unsigned char *limit,
const unsigned char *thresh) {
__m128i mask, hev, flat, flat2;
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi8(1);
__m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
__m128i abs_p1p0;
@@ -372,7 +372,7 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int pitch,
const unsigned char *limit,
const unsigned char *thresh) {
__m128i mask, hev, flat, flat2;
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi8(1);
__m128i p7, p6, p5;
__m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
diff --git a/libvpx/vpx_dsp/x86/loopfilter_sse2.c b/libvpx/vpx_dsp/x86/loopfilter_sse2.c
index 347c9fdbe..6ea34cdd1 100644
--- a/libvpx/vpx_dsp/x86/loopfilter_sse2.c
+++ b/libvpx/vpx_dsp/x86/loopfilter_sse2.c
@@ -106,7 +106,7 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) {
void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i limit_v =
_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit),
_mm_loadl_epi64((const __m128i *)limit));
@@ -140,7 +140,7 @@ void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i limit_v =
_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit),
_mm_loadl_epi64((const __m128i *)limit));
@@ -232,7 +232,7 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi8(1);
const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
@@ -594,7 +594,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int pitch,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi8(1);
const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
@@ -932,7 +932,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int pitch,
DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
@@ -1152,7 +1152,7 @@ void vpx_lpf_horizontal_8_dual_sse2(
DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i blimit =
_mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0),
_mm_load_si128((const __m128i *)blimit1));
@@ -1406,7 +1406,7 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int pitch,
const __m128i thresh =
_mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0),
_mm_load_si128((const __m128i *)thresh1));
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
__m128i p3, p2, p1, p0, q0, q1, q2, q3;
__m128i mask, hev, flat;
diff --git a/libvpx/vpx_dsp/x86/mem_sse2.h b/libvpx/vpx_dsp/x86/mem_sse2.h
index 8b6d4d1dd..031f361a4 100644
--- a/libvpx/vpx_dsp/x86/mem_sse2.h
+++ b/libvpx/vpx_dsp/x86/mem_sse2.h
@@ -27,13 +27,13 @@ static INLINE int32_t loadu_int32(const void *src) {
}
static INLINE __m128i load_unaligned_u32(const void *a) {
- uint32_t val;
+ int val;
memcpy(&val, a, sizeof(val));
return _mm_cvtsi32_si128(val);
}
static INLINE void store_unaligned_u32(void *const a, const __m128i v) {
- const uint32_t val = _mm_cvtsi128_si32(v);
+ const int val = _mm_cvtsi128_si32(v);
memcpy(a, &val, sizeof(val));
}
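
The mem_sse2.h change switches the scalar temporary from uint32_t to int so that it matches the int parameter of _mm_cvtsi32_si128 and the int return of _mm_cvtsi128_si32, avoiding implicit sign-conversion warnings; the memcpy continues to provide unaligned, aliasing-safe access. A self-contained round trip in the same spirit (helper names here are illustrative, not from libvpx):

/* Load 4 unaligned bytes into the low lane of an __m128i and store them back. */
#include <emmintrin.h>
#include <stdio.h>
#include <string.h>

static __m128i load_u32(const void *p) {
  int v;                    /* matches the int parameter of _mm_cvtsi32_si128 */
  memcpy(&v, p, sizeof(v)); /* unaligned-safe */
  return _mm_cvtsi32_si128(v);
}

static void store_u32(void *p, __m128i x) {
  const int v = _mm_cvtsi128_si32(x); /* low 32 bits */
  memcpy(p, &v, sizeof(v));
}

int main(void) {
  const unsigned char buf[5] = { 9, 1, 2, 3, 4 };
  unsigned char out[4];
  store_u32(out, load_u32(buf + 1)); /* deliberately misaligned source */
  printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]);
  return 0;
}
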
diff --git a/libvpx/vpx_dsp/x86/post_proc_sse2.c b/libvpx/vpx_dsp/x86/post_proc_sse2.c
index d1029afc4..119fa7cd1 100644
--- a/libvpx/vpx_dsp/x86/post_proc_sse2.c
+++ b/libvpx/vpx_dsp/x86/post_proc_sse2.c
@@ -36,7 +36,7 @@ void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows,
__m128i s = _mm_loadl_epi64((__m128i *)dst);
__m128i sum, sumsq_0, sumsq_1;
__m128i tmp_0, tmp_1;
- __m128i below_context;
+ __m128i below_context = _mm_setzero_si128();
s = _mm_unpacklo_epi8(s, zero);
diff --git a/libvpx/vpx_dsp/x86/quantize_avx.c b/libvpx/vpx_dsp/x86/quantize_avx.c
index 706e4e641..7d8352721 100644
--- a/libvpx/vpx_dsp/x86/quantize_avx.c
+++ b/libvpx/vpx_dsp/x86/quantize_avx.c
@@ -93,8 +93,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dequant = _mm_unpackhi_epi64(dequant, dequant);
calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
- eob =
- scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
}
// AC only loop.
@@ -134,8 +133,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
- eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
- zero);
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
eob = _mm_max_epi16(eob, eob0);
}
@@ -229,8 +227,7 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dequant = _mm_unpackhi_epi64(dequant, dequant);
calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8);
- eob =
- scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
}
// AC only loop.
@@ -272,8 +269,7 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero,
dqcoeff_ptr + index + 8);
- eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
- zero);
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
eob = _mm_max_epi16(eob, eob0);
}
diff --git a/libvpx/vpx_dsp/x86/quantize_avx2.c b/libvpx/vpx_dsp/x86/quantize_avx2.c
new file mode 100644
index 000000000..28f7c9c7d
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/quantize_avx2.c
@@ -0,0 +1,293 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static VPX_FORCE_INLINE void load_b_values_avx2(
+ const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr,
+ __m256i *round, const int16_t *quant_ptr, __m256i *quant,
+ const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr,
+ __m256i *shift, int log_scale) {
+ *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr));
+ *zbin = _mm256_permute4x64_epi64(*zbin, 0x54);
+ if (log_scale > 0) {
+ const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1)));
+ *zbin = _mm256_add_epi16(*zbin, rnd);
+ *zbin = _mm256_srai_epi16(*zbin, log_scale);
+ }
+ // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when
+ // calculating the zbin mask. (See quantize_b_logscale{0,1,2}_16)
+ *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1));
+
+ *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr));
+ *round = _mm256_permute4x64_epi64(*round, 0x54);
+ if (log_scale > 0) {
+ const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1)));
+ *round = _mm256_add_epi16(*round, rnd);
+ *round = _mm256_srai_epi16(*round, log_scale);
+ }
+
+ *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr));
+ *quant = _mm256_permute4x64_epi64(*quant, 0x54);
+ *dequant =
+ _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
+ *dequant = _mm256_permute4x64_epi64(*dequant, 0x54);
+ *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr));
+ *shift = _mm256_permute4x64_epi64(*shift, 0x54);
+}
+
+static VPX_FORCE_INLINE __m256i
+load_coefficients_avx2(const tran_low_t *coeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ // typedef int32_t tran_low_t;
+ const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(coeff_ptr + 8));
+ return _mm256_packs_epi32(coeff1, coeff2);
+#else
+ // typedef int16_t tran_low_t;
+ return _mm256_loadu_si256((const __m256i *)coeff_ptr);
+#endif
+}
+
+static VPX_FORCE_INLINE void store_coefficients_avx2(__m256i coeff_vals,
+ tran_low_t *coeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ // typedef int32_t tran_low_t;
+ __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15);
+ __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign);
+ __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign);
+ _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals_lo);
+ _mm256_storeu_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi);
+#else
+ // typedef int16_t tran_low_t;
+ _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals);
+#endif
+}
+
+static VPX_FORCE_INLINE __m256i
+quantize_b_16(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, __m256i *v_quant, __m256i *v_dequant,
+ __m256i *v_round, __m256i *v_zbin, __m256i *v_quant_shift) {
+ const __m256i v_coeff = load_coefficients_avx2(coeff_ptr);
+ const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff);
+ const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin);
+
+ if (_mm256_movemask_epi8(v_zbin_mask) == 0) {
+ _mm256_storeu_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256());
+ _mm256_storeu_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256());
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256());
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256());
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ return _mm256_setzero_si256();
+ }
+ {
+ // tmp = v_zbin_mask ? (int64_t)abs_coeff + log_scaled_round : 0
+ const __m256i v_tmp_rnd =
+ _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask);
+
+ const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant);
+ const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd);
+ const __m256i v_tmp32 = _mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift);
+ const __m256i v_nz_mask =
+ _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256());
+ const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff);
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m256i low = _mm256_mullo_epi16(v_qcoeff, *v_dequant);
+ const __m256i high = _mm256_mulhi_epi16(v_qcoeff, *v_dequant);
+
+ const __m256i v_dqcoeff_lo = _mm256_unpacklo_epi16(low, high);
+ const __m256i v_dqcoeff_hi = _mm256_unpackhi_epi16(low, high);
+#else
+ const __m256i v_dqcoeff = _mm256_mullo_epi16(v_qcoeff, *v_dequant);
+#endif
+
+ store_coefficients_avx2(v_qcoeff, qcoeff_ptr);
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo);
+ _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi);
+#else
+ store_coefficients_avx2(v_dqcoeff, dqcoeff_ptr);
+#endif
+ return v_nz_mask;
+ }
+}
+
+static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan,
+ __m256i v_eobmax,
+ __m256i v_mask) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m256i v_iscan = _mm256_permute4x64_epi64(
+ _mm256_loadu_si256((const __m256i *)iscan), 0xD8);
+#else
+ const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan);
+#endif
+ const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask);
+ return _mm256_max_epi16(v_eobmax, v_nz_iscan);
+}
+
+static VPX_FORCE_INLINE int16_t accumulate_eob256(__m256i eob256) {
+ const __m128i eob_lo = _mm256_castsi256_si128(eob256);
+ const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1);
+ __m128i eob = _mm_max_epi16(eob_lo, eob_hi);
+ __m128i eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ return _mm_extract_epi16(eob, 1);
+}
+
+void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift, v_nz_mask;
+ __m256i v_eobmax = _mm256_setzero_si256();
+ intptr_t count;
+ (void)scan;
+
+ load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr,
+ &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr,
+ &v_quant_shift, 0);
+ // Do DC and first 15 AC.
+ v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant,
+ &v_dequant, &v_round, &v_zbin, &v_quant_shift);
+
+ v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask);
+
+ v_round = _mm256_unpackhi_epi64(v_round, v_round);
+ v_quant = _mm256_unpackhi_epi64(v_quant, v_quant);
+ v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant);
+ v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift);
+ v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin);
+
+ for (count = n_coeffs - 16; count > 0; count -= 16) {
+ coeff_ptr += 16;
+ qcoeff_ptr += 16;
+ dqcoeff_ptr += 16;
+ iscan += 16;
+ v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant,
+ &v_dequant, &v_round, &v_zbin, &v_quant_shift);
+
+ v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask);
+ }
+
+ *eob_ptr = accumulate_eob256(v_eobmax);
+}
+
+static VPX_FORCE_INLINE __m256i quantize_b_32x32_16(
+ const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *iscan, __m256i *v_quant,
+ __m256i *v_dequant, __m256i *v_round, __m256i *v_zbin,
+ __m256i *v_quant_shift, __m256i *v_eobmax) {
+ const __m256i v_coeff = load_coefficients_avx2(coeff_ptr);
+ const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff);
+ const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin);
+
+ if (_mm256_movemask_epi8(v_zbin_mask) == 0) {
+ _mm256_store_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256());
+ _mm256_store_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256());
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256());
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256());
+#endif
+ return *v_eobmax;
+ }
+ {
+ // tmp = v_zbin_mask ? (int64_t)abs_coeff + round : 0
+ const __m256i v_tmp_rnd =
+ _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask);
+ // tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+ // quant_shift_ptr[rc != 0]) >> 15);
+ const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant);
+ const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd);
+ const __m256i v_tmp32_hi =
+ _mm256_slli_epi16(_mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift), 1);
+ const __m256i v_tmp32_lo =
+ _mm256_srli_epi16(_mm256_mullo_epi16(v_tmp32_b, *v_quant_shift), 15);
+ const __m256i v_tmp32 = _mm256_or_si256(v_tmp32_hi, v_tmp32_lo);
+ const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff);
+ const __m256i v_sign_lo =
+ _mm256_unpacklo_epi16(_mm256_setzero_si256(), v_coeff);
+ const __m256i v_sign_hi =
+ _mm256_unpackhi_epi16(_mm256_setzero_si256(), v_coeff);
+ const __m256i low = _mm256_mullo_epi16(v_tmp32, *v_dequant);
+ const __m256i high = _mm256_mulhi_epi16(v_tmp32, *v_dequant);
+ const __m256i v_dqcoeff_lo = _mm256_sign_epi32(
+ _mm256_srli_epi32(_mm256_unpacklo_epi16(low, high), 1), v_sign_lo);
+ const __m256i v_dqcoeff_hi = _mm256_sign_epi32(
+ _mm256_srli_epi32(_mm256_unpackhi_epi16(low, high), 1), v_sign_hi);
+ const __m256i v_nz_mask =
+ _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256());
+
+ store_coefficients_avx2(v_qcoeff, qcoeff_ptr);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo);
+ _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi);
+#else
+ store_coefficients_avx2(_mm256_packs_epi32(v_dqcoeff_lo, v_dqcoeff_hi),
+ dqcoeff_ptr);
+#endif
+
+ return get_max_lane_eob(iscan, *v_eobmax, v_nz_mask);
+ }
+}
+
+void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift;
+ __m256i v_eobmax = _mm256_setzero_si256();
+ intptr_t count;
+ (void)n_coeffs;
+ (void)scan;
+
+ load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr,
+ &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr,
+ &v_quant_shift, 1);
+
+ // Do DC and first 15 AC.
+ v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan,
+ &v_quant, &v_dequant, &v_round, &v_zbin,
+ &v_quant_shift, &v_eobmax);
+
+ v_round = _mm256_unpackhi_epi64(v_round, v_round);
+ v_quant = _mm256_unpackhi_epi64(v_quant, v_quant);
+ v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant);
+ v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift);
+ v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin);
+
+ for (count = (32 * 32) - 16; count > 0; count -= 16) {
+ coeff_ptr += 16;
+ qcoeff_ptr += 16;
+ dqcoeff_ptr += 16;
+ iscan += 16;
+ v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan,
+ &v_quant, &v_dequant, &v_round, &v_zbin,
+ &v_quant_shift, &v_eobmax);
+ }
+
+ *eob_ptr = accumulate_eob256(v_eobmax);
+}
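
For orientation, the per-coefficient arithmetic that quantize_b_16 vectorises (the log_scale == 0 path) can be restated in scalar form roughly as below. This is a sketch derived from the vector operations in the hunk above: the adds_epi16 saturation is mirrored by an explicit clamp, the 16-lane packing and the DC/AC selection are glossed over, and the function name is illustrative rather than a libvpx API.

/* Scalar sketch of one coefficient of quantize_b_16 (log_scale == 0). */
#include <stdint.h>
#include <stdlib.h>

void quantize_one_sketch(int16_t coeff, int16_t zbin, int16_t round,
                         int16_t quant, int16_t quant_shift, int16_t dequant,
                         int16_t *qcoeff, int16_t *dqcoeff) {
  const int abs_coeff = abs(coeff);
  int tmp, tmp32;
  if (abs_coeff < zbin) { /* vector code: cmpgt against zbin - 1 */
    *qcoeff = 0;
    *dqcoeff = 0;
    return;
  }
  tmp = abs_coeff + round;
  if (tmp > INT16_MAX) tmp = INT16_MAX;                        /* adds_epi16 saturation */
  tmp32 = ((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16; /* the two mulhi steps */
  *qcoeff = (int16_t)(coeff < 0 ? -tmp32 : tmp32);             /* sign_epi16 */
  *dqcoeff = (int16_t)(*qcoeff * dequant);                     /* mullo_epi16 */
}
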
diff --git a/libvpx/vpx_dsp/x86/quantize_sse2.c b/libvpx/vpx_dsp/x86/quantize_sse2.c
index 459d95f28..9533e7916 100644
--- a/libvpx/vpx_dsp/x86/quantize_sse2.c
+++ b/libvpx/vpx_dsp/x86/quantize_sse2.c
@@ -76,7 +76,7 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dequant = _mm_unpackhi_epi64(dequant, dequant);
calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
- eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
// AC only loop.
while (index < n_coeffs) {
@@ -106,8 +106,7 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
- eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
- zero);
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
eob = _mm_max_epi16(eob, eob0);
index += 16;
diff --git a/libvpx/vpx_dsp/x86/quantize_sse2.h b/libvpx/vpx_dsp/x86/quantize_sse2.h
index afe2f924b..27bfb4e41 100644
--- a/libvpx/vpx_dsp/x86/quantize_sse2.h
+++ b/libvpx/vpx_dsp/x86/quantize_sse2.h
@@ -29,6 +29,15 @@ static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,
*shift = _mm_load_si128((const __m128i *)shift_ptr);
}
+static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round,
+ const int16_t *quant_ptr, __m128i *quant,
+ const int16_t *dequant_ptr,
+ __m128i *dequant) {
+ *round = _mm_load_si128((const __m128i *)round_ptr);
+ *quant = _mm_load_si128((const __m128i *)quant_ptr);
+ *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+}
+
// With ssse3 and later abs() and sign() are preferred.
static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) {
a = _mm_xor_si128(a, sign);
@@ -62,11 +71,8 @@ static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant,
#endif // CONFIG_VP9_HIGHBITDEPTH
}
-// Scan 16 values for eob reference in scan. Use masks (-1) from comparing to
-// zbin to add 1 to the index in 'scan'.
+// Scan 16 values for eob reference in scan.
static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,
- const __m128i zbin_mask0,
- const __m128i zbin_mask1,
const int16_t *scan, const int index,
const __m128i zero) {
const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero);
@@ -74,9 +80,6 @@ static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,
__m128i scan0 = _mm_load_si128((const __m128i *)(scan + index));
__m128i scan1 = _mm_load_si128((const __m128i *)(scan + index + 8));
__m128i eob0, eob1;
- // Add one to convert from indices to counts
- scan0 = _mm_sub_epi16(scan0, zbin_mask0);
- scan1 = _mm_sub_epi16(scan1, zbin_mask1);
eob0 = _mm_andnot_si128(zero_coeff0, scan0);
eob1 = _mm_andnot_si128(zero_coeff1, scan1);
return _mm_max_epi16(eob0, eob1);
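
The simplified scan_for_eob above no longer folds the index-to-count adjustment into the scan values; it now just returns the maximum iscan entry among non-zero coefficients in the two vectors. A scalar sketch of the quantity being maximised follows; the final conversion from index to count happens elsewhere in the SIMD path (outside this hunk), so the +1 below is shown only to match the usual eob semantics.

/* Scalar sketch of the end-of-block scan: the largest iscan value among
 * non-zero quantized coefficients, turned into a count at the end.
 * Illustrative; the SIMD helper handles 16 coefficients per call. */
#include <stdint.h>

int scalar_eob_sketch(const int16_t *qcoeff, const int16_t *iscan, int n) {
  int i, eob = -1;
  for (i = 0; i < n; i++) {
    if (qcoeff[i] && iscan[i] > eob) eob = iscan[i];
  }
  return eob + 1; /* count rather than index */
}
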
diff --git a/libvpx/vpx_dsp/x86/quantize_ssse3.c b/libvpx/vpx_dsp/x86/quantize_ssse3.c
index 9d2a88b7b..476230286 100644
--- a/libvpx/vpx_dsp/x86/quantize_ssse3.c
+++ b/libvpx/vpx_dsp/x86/quantize_ssse3.c
@@ -70,7 +70,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dequant = _mm_unpackhi_epi64(dequant, dequant);
calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
- eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
// AC only loop.
while (index < n_coeffs) {
@@ -98,8 +98,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
- eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
- zero);
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
eob = _mm_max_epi16(eob, eob0);
index += 16;
@@ -202,8 +201,7 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dequant = _mm_unpackhi_epi64(dequant, dequant);
calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8);
- eob =
- scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
}
// AC only loop.
@@ -249,8 +247,7 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero,
dqcoeff_ptr + 8 + index);
- eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
- zero);
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
eob = _mm_max_epi16(eob, eob0);
}
diff --git a/libvpx/vpx_dsp/x86/sad_avx2.c b/libvpx/vpx_dsp/x86/sad_avx2.c
index 3b48acd51..29bedb0e6 100644
--- a/libvpx/vpx_dsp/x86/sad_avx2.c
+++ b/libvpx/vpx_dsp/x86/sad_avx2.c
@@ -14,7 +14,7 @@
#define FSAD64_H(h) \
unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \
const uint8_t *ref_ptr, int ref_stride) { \
- int i, res; \
+ int i; \
__m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
__m256i sum_sad = _mm256_setzero_si256(); \
__m256i sum_sad_h; \
@@ -35,8 +35,7 @@
sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
- res = _mm_cvtsi128_si32(sum_sad128); \
- return res; \
+ return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \
}
#define FSAD32_H(h) \
@@ -92,7 +91,7 @@ FSAD32
unsigned int vpx_sad64x##h##_avg_avx2( \
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
int ref_stride, const uint8_t *second_pred) { \
- int i, res; \
+ int i; \
__m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
__m256i sum_sad = _mm256_setzero_si256(); \
__m256i sum_sad_h; \
@@ -118,15 +117,14 @@ FSAD32
sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
- res = _mm_cvtsi128_si32(sum_sad128); \
- return res; \
+ return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \
}
#define FSADAVG32_H(h) \
unsigned int vpx_sad32x##h##_avg_avx2( \
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
int ref_stride, const uint8_t *second_pred) { \
- int i, res; \
+ int i; \
__m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
__m256i sum_sad = _mm256_setzero_si256(); \
__m256i sum_sad_h; \
@@ -156,8 +154,7 @@ FSAD32
sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
- res = _mm_cvtsi128_si32(sum_sad128); \
- return res; \
+ return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \
}
#define FSADAVG64 \
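
The sad_avx2.c edits only drop the res temporary and cast the 32-bit reduction result to the unsigned return type. The horizontal reduction itself, visible in each macro body, folds the per-lane SAD partials down to a single scalar; a standalone restatement is sketched below. The derivation of sum_sad_h sits above the hunks shown here, so it is assumed to be a 64-bit in-lane shift.

/* Fold a __m256i SAD accumulator (assumed: four 64-bit partials from vpsadbw)
 * down to one unsigned int, as done at the end of each FSAD*_H macro. */
#include <immintrin.h>

unsigned int reduce_sad256_sketch(__m256i sum_sad) {
  const __m256i sum_sad_h = _mm256_srli_si256(sum_sad, 8); /* assumed: upper qword of each lane */
  __m128i sum_sad128;
  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
  return (unsigned int)_mm_cvtsi128_si32(sum_sad128); /* cast avoids the int->unsigned warning */
}
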
diff --git a/libvpx/vpx_dsp/x86/subtract_avx2.c b/libvpx/vpx_dsp/x86/subtract_avx2.c
new file mode 100644
index 000000000..4849581ed
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/subtract_avx2.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static VPX_FORCE_INLINE void subtract32_avx2(int16_t *diff_ptr,
+ const uint8_t *src_ptr,
+ const uint8_t *pred_ptr) {
+ const __m256i s = _mm256_lddqu_si256((const __m256i *)src_ptr);
+ const __m256i p = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+ const __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s));
+ const __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1));
+ const __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p));
+ const __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1));
+ const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+ const __m256i d_1 = _mm256_sub_epi16(s_1, p_1);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d_0);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d_1);
+}
+
+static VPX_FORCE_INLINE void subtract_block_16xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ int j;
+ for (j = 0; j < rows; ++j) {
+ const __m128i s = _mm_lddqu_si128((const __m128i *)src_ptr);
+ const __m128i p = _mm_lddqu_si128((const __m128i *)pred_ptr);
+ const __m256i s_0 = _mm256_cvtepu8_epi16(s);
+ const __m256i p_0 = _mm256_cvtepu8_epi16(p);
+ const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d_0);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+static VPX_FORCE_INLINE void subtract_block_32xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ int j;
+ for (j = 0; j < rows; ++j) {
+ subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+static VPX_FORCE_INLINE void subtract_block_64xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ int j;
+ for (j = 0; j < rows; ++j) {
+ subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+ subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr,
+ ptrdiff_t pred_stride) {
+ switch (cols) {
+ case 16:
+ subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ break;
+ case 32:
+ subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ break;
+ case 64:
+ subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ break;
+ default:
+ vpx_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr,
+ src_stride, pred_ptr, pred_stride);
+ break;
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride,
+ const uint8_t *src8_ptr,
+ ptrdiff_t src_stride,
+ const uint8_t *pred8_ptr,
+ ptrdiff_t pred_stride, int bd) {
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr);
+ uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(pred8_ptr);
+ (void)bd;
+ if (cols == 64) {
+ int j = rows;
+ do {
+ const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr);
+ const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16));
+ const __m256i s2 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 32));
+ const __m256i s3 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 48));
+ const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+ const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16));
+ const __m256i p2 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 32));
+ const __m256i p3 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 48));
+ const __m256i d0 = _mm256_sub_epi16(s0, p0);
+ const __m256i d1 = _mm256_sub_epi16(s1, p1);
+ const __m256i d2 = _mm256_sub_epi16(s2, p2);
+ const __m256i d3 = _mm256_sub_epi16(s3, p3);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d0);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 32), d2);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 48), d3);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ } while (--j != 0);
+ } else if (cols == 32) {
+ int j = rows;
+ do {
+ const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr);
+ const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16));
+ const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+ const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16));
+ const __m256i d0 = _mm256_sub_epi16(s0, p0);
+ const __m256i d1 = _mm256_sub_epi16(s1, p1);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d0);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ } while (--j != 0);
+ } else if (cols == 16) {
+ int j = rows;
+ do {
+ const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr);
+ const __m256i s1 =
+ _mm256_lddqu_si256((const __m256i *)(src_ptr + src_stride));
+ const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+ const __m256i p1 =
+ _mm256_lddqu_si256((const __m256i *)(pred_ptr + pred_stride));
+ const __m256i d0 = _mm256_sub_epi16(s0, p0);
+ const __m256i d1 = _mm256_sub_epi16(s1, p1);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d0);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + diff_stride), d1);
+ src_ptr += src_stride << 1;
+ pred_ptr += pred_stride << 1;
+ diff_ptr += diff_stride << 1;
+ j -= 2;
+ } while (j != 0);
+ } else if (cols == 8) {
+ int j = rows;
+ do {
+ const __m128i s0 = _mm_lddqu_si128((const __m128i *)src_ptr);
+ const __m128i s1 =
+ _mm_lddqu_si128((const __m128i *)(src_ptr + src_stride));
+ const __m128i p0 = _mm_lddqu_si128((const __m128i *)pred_ptr);
+ const __m128i p1 =
+ _mm_lddqu_si128((const __m128i *)(pred_ptr + pred_stride));
+ const __m128i d0 = _mm_sub_epi16(s0, p0);
+ const __m128i d1 = _mm_sub_epi16(s1, p1);
+ _mm_storeu_si128((__m128i *)diff_ptr, d0);
+ _mm_storeu_si128((__m128i *)(diff_ptr + diff_stride), d1);
+ src_ptr += src_stride << 1;
+ pred_ptr += pred_stride << 1;
+ diff_ptr += diff_stride << 1;
+ j -= 2;
+ } while (j != 0);
+ } else {
+ int j = rows;
+ assert(cols == 4);
+ do {
+ const __m128i s0 = _mm_loadl_epi64((const __m128i *)src_ptr);
+ const __m128i s1 =
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
+ const __m128i p0 = _mm_loadl_epi64((const __m128i *)pred_ptr);
+ const __m128i p1 =
+ _mm_loadl_epi64((const __m128i *)(pred_ptr + pred_stride));
+ const __m128i d0 = _mm_sub_epi16(s0, p0);
+ const __m128i d1 = _mm_sub_epi16(s1, p1);
+ _mm_storel_epi64((__m128i *)diff_ptr, d0);
+ _mm_storel_epi64((__m128i *)(diff_ptr + diff_stride), d1);
+ src_ptr += src_stride << 1;
+ pred_ptr += pred_stride << 1;
+ diff_ptr += diff_stride << 1;
+ j -= 2;
+ } while (j != 0);
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
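
For reference, every branch of the new subtract_avx2.c kernels computes the same per-pixel difference; a scalar sketch of the low-bit-depth path (arbitrary rows, cols and strides) is simply:

/* diff[r][c] = src[r][c] - pred[r][c]; illustrative scalar equivalent. */
#include <stddef.h>
#include <stdint.h>

void subtract_block_sketch(int rows, int cols, int16_t *diff,
                           ptrdiff_t diff_stride, const uint8_t *src,
                           ptrdiff_t src_stride, const uint8_t *pred,
                           ptrdiff_t pred_stride) {
  int r, c;
  for (r = 0; r < rows; ++r) {
    for (c = 0; c < cols; ++c) diff[c] = (int16_t)(src[c] - pred[c]);
    diff += diff_stride;
    src += src_stride;
    pred += pred_stride;
  }
}
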
diff --git a/libvpx/vpx_dsp/x86/sum_squares_sse2.c b/libvpx/vpx_dsp/x86/sum_squares_sse2.c
index 14f3b35c0..df6514b2c 100644
--- a/libvpx/vpx_dsp/x86/sum_squares_sse2.c
+++ b/libvpx/vpx_dsp/x86/sum_squares_sse2.c
@@ -33,7 +33,7 @@ uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) {
} else {
// Generic case
int r = size;
- const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
+ const __m128i v_zext_mask_q = _mm_set_epi32(0, -1, 0, -1);
__m128i v_acc_q = _mm_setzero_si128();
assert(size % 8 == 0);
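
The constant change above is cosmetic for the compiler: -1 and 0xffffffff share the same 32-bit two's-complement pattern, so v_zext_mask_q still isolates the low 32 bits of each 64-bit lane, but the literal no longer triggers an implicit unsigned-to-int conversion. A quick standalone check, not part of the patch:

/* Verify the lane pattern: 0x00000000ffffffff in each 64-bit half. */
#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  const __m128i a = _mm_set_epi32(0, -1, 0, -1);
  const __m128i ones = _mm_cmpeq_epi8(a, a);     /* all bits set */
  const __m128i mask = _mm_srli_epi64(ones, 32); /* low 32 bits of each qword */
  printf("same pattern: %d\n",
         _mm_movemask_epi8(_mm_cmpeq_epi8(a, mask)) == 0xffff);
  return 0;
}
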
diff --git a/libvpx/vpx_dsp/x86/variance_avx2.c b/libvpx/vpx_dsp/x86/variance_avx2.c
index 9232acbfb..35925d590 100644
--- a/libvpx/vpx_dsp/x86/variance_avx2.c
+++ b/libvpx/vpx_dsp/x86/variance_avx2.c
@@ -590,17 +590,20 @@ static INLINE int sub_pix_var32xh(const uint8_t *src, int src_stride,
return sum;
}
-static unsigned int sub_pixel_variance32xh_avx2(
- const uint8_t *src, int src_stride, int x_offset, int y_offset,
- const uint8_t *dst, int dst_stride, int height, unsigned int *sse) {
+static int sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
+ int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride,
+ int height, unsigned int *sse) {
return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride,
NULL, 0, 0, height, sse);
}
-static unsigned int sub_pixel_avg_variance32xh_avx2(
- const uint8_t *src, int src_stride, int x_offset, int y_offset,
- const uint8_t *dst, int dst_stride, const uint8_t *second_pred,
- int second_stride, int height, unsigned int *sse) {
+static int sub_pixel_avg_variance32xh_avx2(const uint8_t *src, int src_stride,
+ int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred,
+ int second_stride, int height,
+ unsigned int *sse) {
return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride,
second_pred, second_stride, 1, height, sse);
}
diff --git a/libvpx/vpx_dsp/x86/variance_sse2.c b/libvpx/vpx_dsp/x86/variance_sse2.c
index a67c92aad..d6eb12da1 100644
--- a/libvpx/vpx_dsp/x86/variance_sse2.c
+++ b/libvpx/vpx_dsp/x86/variance_sse2.c
@@ -19,7 +19,7 @@
static INLINE unsigned int add32x4_sse2(__m128i val) {
val = _mm_add_epi32(val, _mm_srli_si128(val, 8));
val = _mm_add_epi32(val, _mm_srli_si128(val, 4));
- return _mm_cvtsi128_si32(val);
+ return (unsigned int)_mm_cvtsi128_si32(val);
}
unsigned int vpx_get_mb_ss_sse2(const int16_t *src_ptr) {
@@ -85,7 +85,7 @@ static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum,
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
vsum = _mm_unpacklo_epi16(vsum, vsum);
vsum = _mm_srai_epi32(vsum, 16);
- *sum = add32x4_sse2(vsum);
+ *sum = (int)add32x4_sse2(vsum);
}
static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
@@ -97,7 +97,7 @@ static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
// Can handle 1024 pixels' diff sum (such as 32x32)
static INLINE int sum_final_sse2(const __m128i sum) {
const __m128i t = sum_to_32bit_sse2(sum);
- return add32x4_sse2(t);
+ return (int)add32x4_sse2(t);
}
static INLINE void variance4_sse2(const uint8_t *src_ptr, const int src_stride,
@@ -349,7 +349,7 @@ unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride,
vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
}
*sse = add32x4_sse2(vsse);
- sum = add32x4_sse2(vsum);
+ sum = (int)add32x4_sse2(vsum);
return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
}
@@ -369,7 +369,7 @@ unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride,
vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
}
*sse = add32x4_sse2(vsse);
- sum = add32x4_sse2(vsum);
+ sum = (int)add32x4_sse2(vsum);
return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
}
@@ -389,7 +389,7 @@ unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride,
vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
}
*sse = add32x4_sse2(vsse);
- sum = add32x4_sse2(vsum);
+ sum = (int)add32x4_sse2(vsum);
return *sse - (unsigned int)(((int64_t)sum * sum) >> 12);
}
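
The int casts in variance_sse2.c only make the signedness of the horizontal sums explicit; the closing expression of each kernel is unchanged and computes variance as sse minus sum squared over the pixel count, with the division written as a shift (>> 11 for the 2048-pixel blocks, >> 12 for 64x64). In scalar form:

/* variance = sse - (sum * sum) / N, N a power of two (e.g. 64 * 64 = 1 << 12). */
#include <stdint.h>

unsigned int variance_from_sums(unsigned int sse, int sum, int log2_pixel_count) {
  return sse - (unsigned int)(((int64_t)sum * sum) >> log2_pixel_count);
}
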
diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
index 0cbd151dc..21a35ae3c 100644
--- a/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
@@ -485,7 +485,7 @@ static void vpx_filter_block1d4_h4_sse2(const uint8_t *src_ptr,
// Saturate and convert to 8-bit words
dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
- *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);
+ *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);
src_ptr += src_stride;
dst_ptr += dst_stride;
@@ -589,8 +589,8 @@ static void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr,
res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, reg_zero);
// Save only half of the register (8 words)
- *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(res_reg_m1012);
- *((uint32_t *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(res_reg_0123);
+ *((int *)(dst_ptr)) = _mm_cvtsi128_si32(res_reg_m1012);
+ *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(res_reg_0123);
// Update the source by two rows
src_ptr += src_stride_unrolled;
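
The pointer-type changes in the filter kernels keep the 4-byte store in step with the int returned by _mm_cvtsi128_si32. An alternative way to write the same store, shown only for illustration (libvpx keeps the direct pointer cast), is a memcpy of the low lane:

/* Store the low 32 bits (four packed output pixels) of an __m128i. */
#include <emmintrin.h>
#include <stdint.h>
#include <string.h>

void store_lo32_sketch(uint8_t *dst, __m128i v) {
  const int lo = _mm_cvtsi128_si32(v); /* low 4 bytes */
  memcpy(dst, &lo, sizeof(lo));
}
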
diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index 6f2983a4b..c7d880860 100644
--- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -227,6 +227,9 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2(
s2[2] = _mm256_unpackhi_epi8(s32b[4], s32b[5]);
}
+ // The output_height is always a multiple of two.
+ assert(!(output_height & 1));
+
for (i = output_height; i > 1; i -= 2) {
__m256i srcRegHead2, srcRegHead3;
@@ -282,35 +285,6 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2(
s2[2] = s2[3];
srcRegHead1 = srcRegHead3;
}
-
- // if the number of strides is odd.
- // process only 16 bytes
- if (i > 0) {
- // load the last 16 bytes
- const __m128i srcRegHead2 =
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
-
- // merge the last 2 results together
- s1[0] = _mm256_castsi128_si256(
- _mm_unpacklo_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2));
- s2[0] = _mm256_castsi128_si256(
- _mm_unpackhi_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2));
-
- outReg1 = convolve8_8_avx2(s1, f);
- outReg2 = convolve8_8_avx2(s2, f);
-
- // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane
- // contain the first and second convolve result respectively
- outReg1 = _mm_packus_epi16(outReg1, outReg2);
-
- // average if necessary
- if (avg) {
- outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
- }
-
- // save 16 bytes
- _mm_store_si128((__m128i *)output_ptr, outReg1);
- }
}
static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
@@ -798,7 +772,7 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr,
// Pack to 8-bits
dst = _mm_packus_epi16(dst, _mm_setzero_si128());
- *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst);
+ *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst);
}
}
diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
index ed46d6245..4ea2752d3 100644
--- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -580,7 +580,7 @@ static void vpx_filter_block1d4_h4_ssse3(const uint8_t *src_ptr,
// Pack to 8-bits
dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
- *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);
+ *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);
src_ptr += src_stride;
dst_ptr += dst_stride;
@@ -666,8 +666,8 @@ static void vpx_filter_block1d4_v4_ssse3(const uint8_t *src_ptr,
reg_1 = _mm_packus_epi16(reg_1, reg_1);
// Save the result
- *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(reg_0);
- *((uint32_t *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(reg_1);
+ *((int *)(dst_ptr)) = _mm_cvtsi128_si32(reg_0);
+ *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(reg_1);
// Update the source by two rows
src_ptr += src_stride_unrolled;