aboutsummaryrefslogtreecommitdiff
path: root/vpx_dsp/x86
diff options
context:
space:
mode:
Diffstat (limited to 'vpx_dsp/x86')
-rw-r--r--vpx_dsp/x86/avg_intrin_avx2.c53
-rw-r--r--vpx_dsp/x86/avg_intrin_sse2.c53
-rw-r--r--vpx_dsp/x86/avg_pred_avx2.c111
-rw-r--r--vpx_dsp/x86/fwd_txfm_avx2.c373
-rw-r--r--vpx_dsp/x86/highbd_quantize_intrin_avx2.c44
-rw-r--r--vpx_dsp/x86/highbd_quantize_intrin_sse2.c40
-rw-r--r--vpx_dsp/x86/highbd_sad4d_avx2.c313
-rw-r--r--vpx_dsp/x86/highbd_sad4d_sse2.asm43
-rw-r--r--vpx_dsp/x86/highbd_sad_avx2.c188
-rw-r--r--vpx_dsp/x86/highbd_sad_sse2.asm59
-rw-r--r--vpx_dsp/x86/inv_txfm_avx2.c626
-rw-r--r--vpx_dsp/x86/quantize_avx.c51
-rw-r--r--vpx_dsp/x86/quantize_avx2.c53
-rw-r--r--vpx_dsp/x86/quantize_sse2.c17
-rw-r--r--vpx_dsp/x86/quantize_sse2.h51
-rw-r--r--vpx_dsp/x86/quantize_ssse3.c51
-rw-r--r--vpx_dsp/x86/sad4d_avx2.c66
-rw-r--r--vpx_dsp/x86/sad4d_sse2.asm43
-rw-r--r--vpx_dsp/x86/sad_avx2.c145
-rw-r--r--vpx_dsp/x86/sad_sse2.asm70
-rw-r--r--vpx_dsp/x86/sse_avx2.c367
-rw-r--r--vpx_dsp/x86/sse_sse4.c312
-rw-r--r--vpx_dsp/x86/subtract_sse2.asm1
-rw-r--r--vpx_dsp/x86/variance_avx2.c80
-rw-r--r--vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c591
25 files changed, 3257 insertions, 544 deletions
diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c
index b2e01319d..61e4e73c5 100644
--- a/vpx_dsp/x86/avg_intrin_avx2.c
+++ b/vpx_dsp/x86/avg_intrin_avx2.c
@@ -218,6 +218,14 @@ void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff,
}
#endif // CONFIG_VP9_HIGHBITDEPTH
+static INLINE void sign_extend_16bit_to_32bit_avx2(__m256i in, __m256i zero,
+ __m256i *out_lo,
+ __m256i *out_hi) {
+ const __m256i sign_bits = _mm256_cmpgt_epi16(zero, in);
+ *out_lo = _mm256_unpacklo_epi16(in, sign_bits);
+ *out_hi = _mm256_unpackhi_epi16(in, sign_bits);
+}
+
static void hadamard_col8x2_avx2(__m256i *in, int iter) {
__m256i a0 = in[0];
__m256i a1 = in[1];
@@ -400,6 +408,12 @@ void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
int16_t *t_coeff = coeff;
#endif
int idx;
+ __m256i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo,
+ b3_lo;
+ __m256i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi,
+ b3_hi;
+ __m256i b0, b1, b2, b3;
+ const __m256i zero = _mm256_setzero_si256();
for (idx = 0; idx < 4; ++idx) {
// src_diff: 9 bit, dynamic range [-255, 255]
const int16_t *src_ptr =
@@ -414,15 +428,38 @@ void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));
- __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
- __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
- __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
- __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);
+ // Sign extend 16 bit to 32 bit.
+ sign_extend_16bit_to_32bit_avx2(coeff0, zero, &coeff0_lo, &coeff0_hi);
+ sign_extend_16bit_to_32bit_avx2(coeff1, zero, &coeff1_lo, &coeff1_hi);
+ sign_extend_16bit_to_32bit_avx2(coeff2, zero, &coeff2_lo, &coeff2_hi);
+ sign_extend_16bit_to_32bit_avx2(coeff3, zero, &coeff3_lo, &coeff3_hi);
+
+ b0_lo = _mm256_add_epi32(coeff0_lo, coeff1_lo);
+ b0_hi = _mm256_add_epi32(coeff0_hi, coeff1_hi);
+
+ b1_lo = _mm256_sub_epi32(coeff0_lo, coeff1_lo);
+ b1_hi = _mm256_sub_epi32(coeff0_hi, coeff1_hi);
+
+ b2_lo = _mm256_add_epi32(coeff2_lo, coeff3_lo);
+ b2_hi = _mm256_add_epi32(coeff2_hi, coeff3_hi);
+
+ b3_lo = _mm256_sub_epi32(coeff2_lo, coeff3_lo);
+ b3_hi = _mm256_sub_epi32(coeff2_hi, coeff3_hi);
+
+ b0_lo = _mm256_srai_epi32(b0_lo, 2);
+ b1_lo = _mm256_srai_epi32(b1_lo, 2);
+ b2_lo = _mm256_srai_epi32(b2_lo, 2);
+ b3_lo = _mm256_srai_epi32(b3_lo, 2);
+
+ b0_hi = _mm256_srai_epi32(b0_hi, 2);
+ b1_hi = _mm256_srai_epi32(b1_hi, 2);
+ b2_hi = _mm256_srai_epi32(b2_hi, 2);
+ b3_hi = _mm256_srai_epi32(b3_hi, 2);
- b0 = _mm256_srai_epi16(b0, 2);
- b1 = _mm256_srai_epi16(b1, 2);
- b2 = _mm256_srai_epi16(b2, 2);
- b3 = _mm256_srai_epi16(b3, 2);
+ b0 = _mm256_packs_epi32(b0_lo, b0_hi);
+ b1 = _mm256_packs_epi32(b1_lo, b1_hi);
+ b2 = _mm256_packs_epi32(b2_lo, b2_hi);
+ b3 = _mm256_packs_epi32(b3_lo, b3_hi);
store_tran_low(_mm256_add_epi16(b0, b2), coeff);
store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256);
diff --git a/vpx_dsp/x86/avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c
index 015c11a1f..4447dfab7 100644
--- a/vpx_dsp/x86/avg_intrin_sse2.c
+++ b/vpx_dsp/x86/avg_intrin_sse2.c
@@ -15,6 +15,14 @@
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_ports/mem.h"
+static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero,
+ __m128i *out_lo,
+ __m128i *out_hi) {
+ const __m128i sign_bits = _mm_cmplt_epi16(in, zero);
+ *out_lo = _mm_unpacklo_epi16(in, sign_bits);
+ *out_hi = _mm_unpackhi_epi16(in, sign_bits);
+}
+
void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
int *min, int *max) {
__m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
@@ -400,6 +408,12 @@ void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
int16_t *t_coeff = coeff;
#endif
int idx;
+ __m128i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo,
+ b3_lo;
+ __m128i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi,
+ b3_hi;
+ __m128i b0, b1, b2, b3;
+ const __m128i zero = _mm_setzero_si128();
for (idx = 0; idx < 4; ++idx) {
const int16_t *src_ptr =
src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
@@ -413,15 +427,38 @@ void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
__m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512));
__m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768));
- __m128i b0 = _mm_add_epi16(coeff0, coeff1);
- __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
- __m128i b2 = _mm_add_epi16(coeff2, coeff3);
- __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
+ // Sign extend 16 bit to 32 bit.
+ sign_extend_16bit_to_32bit_sse2(coeff0, zero, &coeff0_lo, &coeff0_hi);
+ sign_extend_16bit_to_32bit_sse2(coeff1, zero, &coeff1_lo, &coeff1_hi);
+ sign_extend_16bit_to_32bit_sse2(coeff2, zero, &coeff2_lo, &coeff2_hi);
+ sign_extend_16bit_to_32bit_sse2(coeff3, zero, &coeff3_lo, &coeff3_hi);
+
+ b0_lo = _mm_add_epi32(coeff0_lo, coeff1_lo);
+ b0_hi = _mm_add_epi32(coeff0_hi, coeff1_hi);
+
+ b1_lo = _mm_sub_epi32(coeff0_lo, coeff1_lo);
+ b1_hi = _mm_sub_epi32(coeff0_hi, coeff1_hi);
+
+ b2_lo = _mm_add_epi32(coeff2_lo, coeff3_lo);
+ b2_hi = _mm_add_epi32(coeff2_hi, coeff3_hi);
+
+ b3_lo = _mm_sub_epi32(coeff2_lo, coeff3_lo);
+ b3_hi = _mm_sub_epi32(coeff2_hi, coeff3_hi);
+
+ b0_lo = _mm_srai_epi32(b0_lo, 2);
+ b1_lo = _mm_srai_epi32(b1_lo, 2);
+ b2_lo = _mm_srai_epi32(b2_lo, 2);
+ b3_lo = _mm_srai_epi32(b3_lo, 2);
+
+ b0_hi = _mm_srai_epi32(b0_hi, 2);
+ b1_hi = _mm_srai_epi32(b1_hi, 2);
+ b2_hi = _mm_srai_epi32(b2_hi, 2);
+ b3_hi = _mm_srai_epi32(b3_hi, 2);
- b0 = _mm_srai_epi16(b0, 2);
- b1 = _mm_srai_epi16(b1, 2);
- b2 = _mm_srai_epi16(b2, 2);
- b3 = _mm_srai_epi16(b3, 2);
+ b0 = _mm_packs_epi32(b0_lo, b0_hi);
+ b1 = _mm_packs_epi32(b1_lo, b1_hi);
+ b2 = _mm_packs_epi32(b2_lo, b2_hi);
+ b3 = _mm_packs_epi32(b3_lo, b3_hi);
coeff0 = _mm_add_epi16(b0, b2);
coeff1 = _mm_add_epi16(b1, b3);
diff --git a/vpx_dsp/x86/avg_pred_avx2.c b/vpx_dsp/x86/avg_pred_avx2.c
new file mode 100644
index 000000000..f4357998c
--- /dev/null
+++ b/vpx_dsp/x86/avg_pred_avx2.c
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride) {
+ int row = 0;
+ // comp_pred and pred must be 32 byte aligned.
+ assert(((intptr_t)comp_pred % 32) == 0);
+ assert(((intptr_t)pred % 32) == 0);
+
+ if (width == 8) {
+ assert(height % 4 == 0);
+ do {
+ const __m256i p = _mm256_load_si256((const __m256i *)pred);
+ const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref);
+ const __m128i r_1 =
+ _mm_loadl_epi64((const __m128i *)(ref + 2 * ref_stride));
+
+ const __m128i r1 = _mm_castps_si128(_mm_loadh_pi(
+ _mm_castsi128_ps(r_0), (const __m64 *)(ref + ref_stride)));
+ const __m128i r2 = _mm_castps_si128(_mm_loadh_pi(
+ _mm_castsi128_ps(r_1), (const __m64 *)(ref + 3 * ref_stride)));
+
+ const __m256i ref_0123 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(r1), r2, 1);
+ const __m256i avg = _mm256_avg_epu8(p, ref_0123);
+
+ _mm256_store_si256((__m256i *)comp_pred, avg);
+
+ row += 4;
+ pred += 32;
+ comp_pred += 32;
+ ref += 4 * ref_stride;
+ } while (row < height);
+ } else if (width == 16) {
+ assert(height % 4 == 0);
+ do {
+ const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred);
+ const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32));
+ const __m256i tmp0 =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)ref));
+ const __m256i ref_0 = _mm256_inserti128_si256(
+ tmp0, _mm_loadu_si128((const __m128i *)(ref + ref_stride)), 1);
+ const __m256i tmp1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(ref + 2 * ref_stride)));
+ const __m256i ref_1 = _mm256_inserti128_si256(
+ tmp1, _mm_loadu_si128((const __m128i *)(ref + 3 * ref_stride)), 1);
+ const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0);
+ const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1);
+ _mm256_store_si256((__m256i *)comp_pred, average_0);
+ _mm256_store_si256((__m256i *)(comp_pred + 32), average_1);
+
+ row += 4;
+ pred += 64;
+ comp_pred += 64;
+ ref += 4 * ref_stride;
+ } while (row < height);
+ } else if (width == 32) {
+ assert(height % 2 == 0);
+ do {
+ const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred);
+ const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32));
+ const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i ref_1 =
+ _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
+ const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0);
+ const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1);
+ _mm256_store_si256((__m256i *)comp_pred, average_0);
+ _mm256_store_si256((__m256i *)(comp_pred + 32), average_1);
+
+ row += 2;
+ pred += 64;
+ comp_pred += 64;
+ ref += 2 * ref_stride;
+ } while (row < height);
+ } else if (width % 64 == 0) {
+ do {
+ int x;
+ for (x = 0; x < width; x += 64) {
+ const __m256i pred_0 = _mm256_load_si256((const __m256i *)(pred + x));
+ const __m256i pred_1 =
+ _mm256_load_si256((const __m256i *)(pred + x + 32));
+ const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)(ref + x));
+ const __m256i ref_1 =
+ _mm256_loadu_si256((const __m256i *)(ref + x + 32));
+ const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0);
+ const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1);
+ _mm256_store_si256((__m256i *)(comp_pred + x), average_0);
+ _mm256_store_si256((__m256i *)(comp_pred + x + 32), average_1);
+ }
+ row++;
+ pred += width;
+ comp_pred += width;
+ ref += ref_stride;
+ } while (row < height);
+ } else {
+ vpx_comp_avg_pred_sse2(comp_pred, pred, width, height, ref, ref_stride);
+ }
+}
diff --git a/vpx_dsp/x86/fwd_txfm_avx2.c b/vpx_dsp/x86/fwd_txfm_avx2.c
index a2ed420e3..c8f54a49c 100644
--- a/vpx_dsp/x86/fwd_txfm_avx2.c
+++ b/vpx_dsp/x86/fwd_txfm_avx2.c
@@ -8,9 +8,382 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <immintrin.h> // AVX2
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#define ADD256_EPI16 _mm256_add_epi16
+#define SUB256_EPI16 _mm256_sub_epi16
+
+static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in,
+ int stride, __m256i *out,
+ int out_size, int pass) {
+ int i;
+ const __m256i kOne = _mm256_set1_epi16(1);
+ if (pass == 0) {
+ for (i = 0; i < out_size; i++) {
+ out[i] = _mm256_loadu_si256((const __m256i *)(in + i * stride));
+ // x = x << 2
+ out[i] = _mm256_slli_epi16(out[i], 2);
+ }
+ } else {
+ for (i = 0; i < out_size; i++) {
+ out[i] = _mm256_loadu_si256((const __m256i *)(in + i * 16));
+ // x = (x + 1) >> 2
+ out[i] = _mm256_add_epi16(out[i], kOne);
+ out[i] = _mm256_srai_epi16(out[i], 2);
+ }
+ }
+}
+
+static INLINE void transpose2_8x8_avx2(const __m256i *const in,
+ __m256i *const out) {
+ int i;
+ __m256i t[16], u[16];
+ // (1st, 2nd) ==> (lo, hi)
+ // (0, 1) ==> (0, 1)
+ // (2, 3) ==> (2, 3)
+ // (4, 5) ==> (4, 5)
+ // (6, 7) ==> (6, 7)
+ for (i = 0; i < 4; i++) {
+ t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]);
+ t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]);
+ }
+
+ // (1st, 2nd) ==> (lo, hi)
+ // (0, 2) ==> (0, 2)
+ // (1, 3) ==> (1, 3)
+ // (4, 6) ==> (4, 6)
+ // (5, 7) ==> (5, 7)
+ for (i = 0; i < 2; i++) {
+ u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]);
+ u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]);
+
+ u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]);
+ u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]);
+ }
+
+ // (1st, 2nd) ==> (lo, hi)
+ // (0, 4) ==> (0, 1)
+ // (1, 5) ==> (4, 5)
+ // (2, 6) ==> (2, 3)
+ // (3, 7) ==> (6, 7)
+ for (i = 0; i < 2; i++) {
+ out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]);
+ out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]);
+
+ out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]);
+ out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]);
+ }
+}
+
+static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in,
+ __m256i *const out) {
+ __m256i t[16];
+
+#define LOADL(idx) \
+ t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
+ t[idx] = _mm256_inserti128_si256( \
+ t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
+
+#define LOADR(idx) \
+ t[8 + idx] = \
+ _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
+ t[8 + idx] = _mm256_inserti128_si256( \
+ t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
+
+ // load left 8x16
+ LOADL(0)
+ LOADL(1)
+ LOADL(2)
+ LOADL(3)
+ LOADL(4)
+ LOADL(5)
+ LOADL(6)
+ LOADL(7)
+
+ // load right 8x16
+ LOADR(0)
+ LOADR(1)
+ LOADR(2)
+ LOADR(3)
+ LOADR(4)
+ LOADR(5)
+ LOADR(6)
+ LOADR(7)
+
+ // get the top 16x8 result
+ transpose2_8x8_avx2(t, out);
+ // get the bottom 16x8 result
+ transpose2_8x8_avx2(&t[8], &out[8]);
+}
+
+// Store 8 16-bit values. Sign extend the values.
+static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in,
+ tran_low_t *out,
+ const int stride,
+ const int out_size) {
+ int i;
+ for (i = 0; i < out_size; ++i) {
+ _mm256_storeu_si256((__m256i *)(out), in[i]);
+ out += stride;
+ }
+}
+
+#define PAIR256_SET_EPI16(a, b) \
+ _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
+
+static INLINE __m256i mult256_round_shift(const __m256i *pin0,
+ const __m256i *pin1,
+ const __m256i *pmultiplier,
+ const __m256i *prounding,
+ const int shift) {
+ const __m256i u0 = _mm256_madd_epi16(*pin0, *pmultiplier);
+ const __m256i u1 = _mm256_madd_epi16(*pin1, *pmultiplier);
+ const __m256i v0 = _mm256_add_epi32(u0, *prounding);
+ const __m256i v1 = _mm256_add_epi32(u1, *prounding);
+ const __m256i w0 = _mm256_srai_epi32(v0, shift);
+ const __m256i w1 = _mm256_srai_epi32(v1, shift);
+ return _mm256_packs_epi32(w0, w1);
+}
+
+static INLINE void fdct16x16_1D_avx2(__m256i *input, __m256i *output) {
+ int i;
+ __m256i step2[4];
+ __m256i in[8];
+ __m256i step1[8];
+ __m256i step3[8];
+
+ const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(cospi_16_64);
+ const __m256i k__cospi_p16_m16 = PAIR256_SET_EPI16(cospi_16_64, -cospi_16_64);
+ const __m256i k__cospi_p24_p08 = PAIR256_SET_EPI16(cospi_24_64, cospi_8_64);
+ const __m256i k__cospi_p08_m24 = PAIR256_SET_EPI16(cospi_8_64, -cospi_24_64);
+ const __m256i k__cospi_m08_p24 = PAIR256_SET_EPI16(-cospi_8_64, cospi_24_64);
+ const __m256i k__cospi_p28_p04 = PAIR256_SET_EPI16(cospi_28_64, cospi_4_64);
+ const __m256i k__cospi_m04_p28 = PAIR256_SET_EPI16(-cospi_4_64, cospi_28_64);
+ const __m256i k__cospi_p12_p20 = PAIR256_SET_EPI16(cospi_12_64, cospi_20_64);
+ const __m256i k__cospi_m20_p12 = PAIR256_SET_EPI16(-cospi_20_64, cospi_12_64);
+ const __m256i k__cospi_p30_p02 = PAIR256_SET_EPI16(cospi_30_64, cospi_2_64);
+ const __m256i k__cospi_p14_p18 = PAIR256_SET_EPI16(cospi_14_64, cospi_18_64);
+ const __m256i k__cospi_m02_p30 = PAIR256_SET_EPI16(-cospi_2_64, cospi_30_64);
+ const __m256i k__cospi_m18_p14 = PAIR256_SET_EPI16(-cospi_18_64, cospi_14_64);
+ const __m256i k__cospi_p22_p10 = PAIR256_SET_EPI16(cospi_22_64, cospi_10_64);
+ const __m256i k__cospi_p06_p26 = PAIR256_SET_EPI16(cospi_6_64, cospi_26_64);
+ const __m256i k__cospi_m10_p22 = PAIR256_SET_EPI16(-cospi_10_64, cospi_22_64);
+ const __m256i k__cospi_m26_p06 = PAIR256_SET_EPI16(-cospi_26_64, cospi_6_64);
+ const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+
+ // Calculate input for the first 8 results.
+ for (i = 0; i < 8; i++) {
+ in[i] = ADD256_EPI16(input[i], input[15 - i]);
+ }
+
+ // Calculate input for the next 8 results.
+ for (i = 0; i < 8; i++) {
+ step1[i] = SUB256_EPI16(input[7 - i], input[8 + i]);
+ }
+
+ // Work on the first eight values; fdct8(input, even_results);
+ {
+ // Add/subtract
+ const __m256i q0 = ADD256_EPI16(in[0], in[7]);
+ const __m256i q1 = ADD256_EPI16(in[1], in[6]);
+ const __m256i q2 = ADD256_EPI16(in[2], in[5]);
+ const __m256i q3 = ADD256_EPI16(in[3], in[4]);
+ const __m256i q4 = SUB256_EPI16(in[3], in[4]);
+ const __m256i q5 = SUB256_EPI16(in[2], in[5]);
+ const __m256i q6 = SUB256_EPI16(in[1], in[6]);
+ const __m256i q7 = SUB256_EPI16(in[0], in[7]);
+
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m256i r0 = ADD256_EPI16(q0, q3);
+ const __m256i r1 = ADD256_EPI16(q1, q2);
+ const __m256i r2 = SUB256_EPI16(q1, q2);
+ const __m256i r3 = SUB256_EPI16(q0, q3);
+
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ {
+ const __m256i t0 = _mm256_unpacklo_epi16(r0, r1);
+ const __m256i t1 = _mm256_unpackhi_epi16(r0, r1);
+ const __m256i t2 = _mm256_unpacklo_epi16(r2, r3);
+ const __m256i t3 = _mm256_unpackhi_epi16(r2, r3);
+
+ output[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[8] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[4] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[12] =
+ mult256_round_shift(&t2, &t3, &k__cospi_m08_p24,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ }
+ }
+
+ // Work on next four results
+ {
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ const __m256i d0 = _mm256_unpacklo_epi16(q6, q5);
+ const __m256i d1 = _mm256_unpackhi_epi16(q6, q5);
+ const __m256i r0 = mult256_round_shift(
+ &d0, &d1, &k__cospi_p16_m16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ const __m256i r1 = mult256_round_shift(
+ &d0, &d1, &k__cospi_p16_p16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+
+ {
+ // Add/subtract
+ const __m256i x0 = ADD256_EPI16(q4, r0);
+ const __m256i x1 = SUB256_EPI16(q4, r0);
+ const __m256i x2 = SUB256_EPI16(q7, r1);
+ const __m256i x3 = ADD256_EPI16(q7, r1);
+
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ {
+ const __m256i t0 = _mm256_unpacklo_epi16(x0, x3);
+ const __m256i t1 = _mm256_unpackhi_epi16(x0, x3);
+ const __m256i t2 = _mm256_unpacklo_epi16(x1, x2);
+ const __m256i t3 = _mm256_unpackhi_epi16(x1, x2);
+ output[2] =
+ mult256_round_shift(&t0, &t1, &k__cospi_p28_p04,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[14] =
+ mult256_round_shift(&t0, &t1, &k__cospi_m04_p28,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[10] =
+ mult256_round_shift(&t2, &t3, &k__cospi_p12_p20,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[6] =
+ mult256_round_shift(&t2, &t3, &k__cospi_m20_p12,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ }
+ }
+ }
+ }
+ // Work on the next eight values; step1 -> odd_results
+ { // step 2
+ {
+ const __m256i t0 = _mm256_unpacklo_epi16(step1[5], step1[2]);
+ const __m256i t1 = _mm256_unpackhi_epi16(step1[5], step1[2]);
+ const __m256i t2 = _mm256_unpacklo_epi16(step1[4], step1[3]);
+ const __m256i t3 = _mm256_unpackhi_epi16(step1[4], step1[3]);
+ step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ }
+ // step 3
+ {
+ step3[0] = ADD256_EPI16(step1[0], step2[1]);
+ step3[1] = ADD256_EPI16(step1[1], step2[0]);
+ step3[2] = SUB256_EPI16(step1[1], step2[0]);
+ step3[3] = SUB256_EPI16(step1[0], step2[1]);
+ step3[4] = SUB256_EPI16(step1[7], step2[3]);
+ step3[5] = SUB256_EPI16(step1[6], step2[2]);
+ step3[6] = ADD256_EPI16(step1[6], step2[2]);
+ step3[7] = ADD256_EPI16(step1[7], step2[3]);
+ }
+ // step 4
+ {
+ const __m256i t0 = _mm256_unpacklo_epi16(step3[1], step3[6]);
+ const __m256i t1 = _mm256_unpackhi_epi16(step3[1], step3[6]);
+ const __m256i t2 = _mm256_unpacklo_epi16(step3[2], step3[5]);
+ const __m256i t3 = _mm256_unpackhi_epi16(step3[2], step3[5]);
+ step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_m08_p24,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p24_p08,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p08_m24,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ }
+ // step 5
+ {
+ step1[0] = ADD256_EPI16(step3[0], step2[0]);
+ step1[1] = SUB256_EPI16(step3[0], step2[0]);
+ step1[2] = ADD256_EPI16(step3[3], step2[1]);
+ step1[3] = SUB256_EPI16(step3[3], step2[1]);
+ step1[4] = SUB256_EPI16(step3[4], step2[3]);
+ step1[5] = ADD256_EPI16(step3[4], step2[3]);
+ step1[6] = SUB256_EPI16(step3[7], step2[2]);
+ step1[7] = ADD256_EPI16(step3[7], step2[2]);
+ }
+ // step 6
+ {
+ const __m256i t0 = _mm256_unpacklo_epi16(step1[0], step1[7]);
+ const __m256i t1 = _mm256_unpackhi_epi16(step1[0], step1[7]);
+ const __m256i t2 = _mm256_unpacklo_epi16(step1[1], step1[6]);
+ const __m256i t3 = _mm256_unpackhi_epi16(step1[1], step1[6]);
+ output[1] = mult256_round_shift(&t0, &t1, &k__cospi_p30_p02,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[9] = mult256_round_shift(&t2, &t3, &k__cospi_p14_p18,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[15] = mult256_round_shift(&t0, &t1, &k__cospi_m02_p30,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[7] = mult256_round_shift(&t2, &t3, &k__cospi_m18_p14,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ }
+ {
+ const __m256i t0 = _mm256_unpacklo_epi16(step1[2], step1[5]);
+ const __m256i t1 = _mm256_unpackhi_epi16(step1[2], step1[5]);
+ const __m256i t2 = _mm256_unpacklo_epi16(step1[3], step1[4]);
+ const __m256i t3 = _mm256_unpackhi_epi16(step1[3], step1[4]);
+ output[5] = mult256_round_shift(&t0, &t1, &k__cospi_p22_p10,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[13] = mult256_round_shift(&t2, &t3, &k__cospi_p06_p26,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[11] = mult256_round_shift(&t0, &t1, &k__cospi_m10_p22,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ output[3] = mult256_round_shift(&t2, &t3, &k__cospi_m26_p06,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ }
+ }
+}
+
+void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride) {
+ int pass;
+ DECLARE_ALIGNED(32, int16_t, intermediate[256]);
+ int16_t *out0 = intermediate;
+ tran_low_t *out1 = output;
+ const int width = 16;
+ const int height = 16;
+ __m256i buf0[16], buf1[16];
+
+ // Two transform and transpose passes
+ // Process 16 columns (transposed rows in second pass) at a time.
+ for (pass = 0; pass < 2; ++pass) {
+ // Load and pre-condition input.
+ load_buffer_16bit_to_16bit_avx2(input, stride, buf1, height, pass);
+
+ // Calculate dct for 16x16 values
+ fdct16x16_1D_avx2(buf1, buf0);
+
+ // Transpose the results.
+ transpose_16bit_16x16_avx2(buf0, buf1);
+
+ if (pass == 0) {
+ store_buffer_16bit_to_32bit_w16_avx2(buf1, out0, width, height);
+ } else {
+ store_buffer_16bit_to_32bit_w16_avx2(buf1, out1, width, height);
+ }
+ // Setup in/out for next pass.
+ input = intermediate;
+ }
+}
+
#if !CONFIG_VP9_HIGHBITDEPTH
#define FDCT32x32_2D_AVX2 vpx_fdct32x32_rd_avx2
#define FDCT32x32_HIGH_PRECISION 0
diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c
index 8edddd637..35ca55404 100644
--- a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c
+++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c
@@ -11,6 +11,8 @@
#include <immintrin.h>
#include "./vpx_dsp_rtcd.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
const __m128i sign = _mm_srai_epi16(*p, 15);
@@ -26,17 +28,15 @@ static VPX_FORCE_INLINE void update_qp(__m256i *qp) {
}
}
-static VPX_FORCE_INLINE void init_qp(const int16_t *zbin_ptr,
- const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *dequant_ptr,
- const int16_t *quant_shift_ptr,
- __m256i *qp, int log_scale) {
- const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr);
- const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
- const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+static VPX_FORCE_INLINE void init_qp(
+ const struct macroblock_plane *const mb_plane, const int16_t *dequant_ptr,
+ __m256i *qp, int log_scale) {
+ const __m128i zbin = _mm_loadu_si128((const __m128i *)mb_plane->zbin);
+ const __m128i round = _mm_loadu_si128((const __m128i *)mb_plane->round);
+ const __m128i quant = _mm_loadu_si128((const __m128i *)mb_plane->quant);
const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
- const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr);
+ const __m128i quant_shift =
+ _mm_loadu_si128((const __m128i *)mb_plane->quant_shift);
init_one_qp(&zbin, &qp[0]);
init_one_qp(&round, &qp[1]);
init_one_qp(&quant, &qp[2]);
@@ -134,19 +134,16 @@ static VPX_FORCE_INLINE void quantize(const __m256i *qp,
}
void vpx_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *zbin_ptr,
- const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr,
+ const struct macroblock_plane *const mb_plane,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
+ const struct ScanOrder *const scan_order) {
const int step = 8;
__m256i eob = _mm256_setzero_si256();
__m256i qp[5];
- (void)scan;
+ const int16_t *iscan = scan_order->iscan;
- init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 0);
+ init_qp(mb_plane, dequant_ptr, qp, 0);
quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
@@ -222,17 +219,16 @@ static VPX_FORCE_INLINE void quantize_b_32x32(
}
void vpx_highbd_quantize_b_32x32_avx2(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
+ const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const struct ScanOrder *const scan_order) {
const unsigned int step = 8;
+ intptr_t n_coeffs = 32 * 32;
+ const int16_t *iscan = scan_order->iscan;
__m256i eob = _mm256_setzero_si256();
__m256i qp[5];
- (void)scan;
- init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 1);
+ init_qp(mb_plane, dequant_ptr, qp, 1);
quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
index ae1981a83..adae60756 100644
--- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -15,19 +15,22 @@
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
-#if CONFIG_VP9_HIGHBITDEPTH
void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
- const int16_t *zbin_ptr,
- const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr,
+ const struct macroblock_plane *mb_plane,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
+ const struct ScanOrder *const scan_order) {
int i, j, non_zero_regs = (int)count / 4, eob_i = 0;
__m128i zbins[2];
__m128i nzbins[2];
+ const int16_t *iscan = scan_order->iscan;
+ const int16_t *zbin_ptr = mb_plane->zbin;
+ const int16_t *round_ptr = mb_plane->round;
+ const int16_t *quant_ptr = mb_plane->quant;
+ const int16_t *quant_shift_ptr = mb_plane->quant_shift;
zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
(int)zbin_ptr[0]);
@@ -38,8 +41,6 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
- (void)scan;
-
memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
@@ -93,19 +94,18 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
}
void vpx_highbd_quantize_b_32x32_sse2(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
+ const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const struct ScanOrder *const scan_order) {
__m128i zbins[2];
__m128i nzbins[2];
int idx = 0;
int idx_arr[1024];
int i, eob = 0;
- const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
- const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
- (void)scan;
+ const intptr_t n_coeffs = 32 * 32;
+ const int16_t *iscan = scan_order->iscan;
+ const int zbin0_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1);
+ const int zbin1_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1);
zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
zbins[1] = _mm_set1_epi32(zbin1_tmp);
@@ -140,14 +140,14 @@ void vpx_highbd_quantize_b_32x32_sse2(
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
- const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
- const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+ const int64_t tmp1 =
+ abs_coeff + ROUND_POWER_OF_TWO(mb_plane->round[rc != 0], 1);
+ const int64_t tmp2 = ((tmp1 * mb_plane->quant[rc != 0]) >> 16) + tmp1;
const uint32_t abs_qcoeff =
- (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+ (uint32_t)((tmp2 * mb_plane->quant_shift[rc != 0]) >> 15);
qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
}
*eob_ptr = eob;
}
-#endif
diff --git a/vpx_dsp/x86/highbd_sad4d_avx2.c b/vpx_dsp/x86/highbd_sad4d_avx2.c
index 947b5e977..e483fdce7 100644
--- a/vpx_dsp/x86/highbd_sad4d_avx2.c
+++ b/vpx_dsp/x86/highbd_sad4d_avx2.c
@@ -61,70 +61,79 @@ static VPX_FORCE_INLINE void highbd_sad64xHx4d(__m256i *sums_16 /*[4]*/,
}
}
+static VPX_FORCE_INLINE void highbd_sad64xNx4d_avx2(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4],
+ int ref_stride, uint32_t sad_array[4], int n) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *refs[4];
+ __m256i sums_16[4];
+ __m256i sums_32[4];
+ int i;
+
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+ sums_32[0] = _mm256_setzero_si256();
+ sums_32[1] = _mm256_setzero_si256();
+ sums_32[2] = _mm256_setzero_si256();
+ sums_32[3] = _mm256_setzero_si256();
+
+ for (i = 0; i < (n / 2); ++i) {
+ sums_16[0] = _mm256_setzero_si256();
+ sums_16[1] = _mm256_setzero_si256();
+ sums_16[2] = _mm256_setzero_si256();
+ sums_16[3] = _mm256_setzero_si256();
+
+ highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2);
+
+ /* sums_16 will outrange after 2 rows, so add current sums_16 to
+ * sums_32*/
+ sums_32[0] = _mm256_add_epi32(
+ sums_32[0],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))));
+ sums_32[1] = _mm256_add_epi32(
+ sums_32[1],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))));
+ sums_32[2] = _mm256_add_epi32(
+ sums_32[2],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))));
+ sums_32[3] = _mm256_add_epi32(
+ sums_32[3],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))));
+
+ src += src_stride << 1;
+ }
+ calc_final_4(sums_32, sad_array);
+}
+
#define HIGHBD_SAD64XNX4D(n) \
- void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \
+ void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src, int src_stride, \
const uint8_t *const ref_array[4], \
int ref_stride, uint32_t sad_array[4]) { \
- const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
- uint16_t *refs[4]; \
- __m256i sums_16[4]; \
- __m256i sums_32[4]; \
- int i; \
- \
- refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \
- refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \
- refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \
- refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \
- sums_32[0] = _mm256_setzero_si256(); \
- sums_32[1] = _mm256_setzero_si256(); \
- sums_32[2] = _mm256_setzero_si256(); \
- sums_32[3] = _mm256_setzero_si256(); \
- \
- for (i = 0; i < (n / 2); ++i) { \
- sums_16[0] = _mm256_setzero_si256(); \
- sums_16[1] = _mm256_setzero_si256(); \
- sums_16[2] = _mm256_setzero_si256(); \
- sums_16[3] = _mm256_setzero_si256(); \
- \
- highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2); \
- \
- /* sums_16 will outrange after 2 rows, so add current sums_16 to \
- * sums_32*/ \
- sums_32[0] = _mm256_add_epi32( \
- sums_32[0], \
- _mm256_add_epi32( \
- _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \
- _mm256_cvtepu16_epi32( \
- _mm256_extractf128_si256(sums_16[0], 1)))); \
- sums_32[1] = _mm256_add_epi32( \
- sums_32[1], \
- _mm256_add_epi32( \
- _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \
- _mm256_cvtepu16_epi32( \
- _mm256_extractf128_si256(sums_16[1], 1)))); \
- sums_32[2] = _mm256_add_epi32( \
- sums_32[2], \
- _mm256_add_epi32( \
- _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \
- _mm256_cvtepu16_epi32( \
- _mm256_extractf128_si256(sums_16[2], 1)))); \
- sums_32[3] = _mm256_add_epi32( \
- sums_32[3], \
- _mm256_add_epi32( \
- _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \
- _mm256_cvtepu16_epi32( \
- _mm256_extractf128_si256(sums_16[3], 1)))); \
- \
- src += src_stride << 1; \
- } \
- calc_final_4(sums_32, sad_array); \
+ highbd_sad64xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \
+ n); \
}
-// 64x64
-HIGHBD_SAD64XNX4D(64)
-
-// 64x32
-HIGHBD_SAD64XNX4D(32)
+#define HIGHBD_SADSKIP64XNx4D(n) \
+ void vpx_highbd_sad_skip_64x##n##x4d_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad64xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \
+ sad_array, n / 2); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/,
const uint16_t *src,
@@ -171,73 +180,79 @@ static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/,
}
}
+static VPX_FORCE_INLINE void highbd_sad32xNx4d_avx2(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4],
+ int ref_stride, uint32_t sad_array[4], int n) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *refs[4];
+ __m256i sums_16[4];
+ __m256i sums_32[4];
+ int i;
+
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+ sums_32[0] = _mm256_setzero_si256();
+ sums_32[1] = _mm256_setzero_si256();
+ sums_32[2] = _mm256_setzero_si256();
+ sums_32[3] = _mm256_setzero_si256();
+
+ for (i = 0; i < (n / 8); ++i) {
+ sums_16[0] = _mm256_setzero_si256();
+ sums_16[1] = _mm256_setzero_si256();
+ sums_16[2] = _mm256_setzero_si256();
+ sums_16[3] = _mm256_setzero_si256();
+
+ highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8);
+
+ /* sums_16 will outrange after 8 rows, so add current sums_16 to
+ * sums_32*/
+ sums_32[0] = _mm256_add_epi32(
+ sums_32[0],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))));
+ sums_32[1] = _mm256_add_epi32(
+ sums_32[1],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))));
+ sums_32[2] = _mm256_add_epi32(
+ sums_32[2],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))));
+ sums_32[3] = _mm256_add_epi32(
+ sums_32[3],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))));
+
+ src += src_stride << 3;
+ }
+ calc_final_4(sums_32, sad_array);
+}
+
#define HIGHBD_SAD32XNX4D(n) \
- void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \
+ void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src, int src_stride, \
const uint8_t *const ref_array[4], \
int ref_stride, uint32_t sad_array[4]) { \
- const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
- uint16_t *refs[4]; \
- __m256i sums_16[4]; \
- __m256i sums_32[4]; \
- int i; \
- \
- refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \
- refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \
- refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \
- refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \
- sums_32[0] = _mm256_setzero_si256(); \
- sums_32[1] = _mm256_setzero_si256(); \
- sums_32[2] = _mm256_setzero_si256(); \
- sums_32[3] = _mm256_setzero_si256(); \
- \
- for (i = 0; i < (n / 8); ++i) { \
- sums_16[0] = _mm256_setzero_si256(); \
- sums_16[1] = _mm256_setzero_si256(); \
- sums_16[2] = _mm256_setzero_si256(); \
- sums_16[3] = _mm256_setzero_si256(); \
- \
- highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); \
- \
- /* sums_16 will outrange after 8 rows, so add current sums_16 to \
- * sums_32*/ \
- sums_32[0] = _mm256_add_epi32( \
- sums_32[0], \
- _mm256_add_epi32( \
- _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \
- _mm256_cvtepu16_epi32( \
- _mm256_extractf128_si256(sums_16[0], 1)))); \
- sums_32[1] = _mm256_add_epi32( \
- sums_32[1], \
- _mm256_add_epi32( \
- _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \
- _mm256_cvtepu16_epi32( \
- _mm256_extractf128_si256(sums_16[1], 1)))); \
- sums_32[2] = _mm256_add_epi32( \
- sums_32[2], \
- _mm256_add_epi32( \
- _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \
- _mm256_cvtepu16_epi32( \
- _mm256_extractf128_si256(sums_16[2], 1)))); \
- sums_32[3] = _mm256_add_epi32( \
- sums_32[3], \
- _mm256_add_epi32( \
- _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \
- _mm256_cvtepu16_epi32( \
- _mm256_extractf128_si256(sums_16[3], 1)))); \
- \
- src += src_stride << 3; \
- } \
- calc_final_4(sums_32, sad_array); \
+ highbd_sad32xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \
+ n); \
}
-// 32x64
-HIGHBD_SAD32XNX4D(64)
-
-// 32x32
-HIGHBD_SAD32XNX4D(32)
-
-// 32x16
-HIGHBD_SAD32XNX4D(16)
+#define HIGHBD_SADSKIP32XNx4D(n) \
+ void vpx_highbd_sad_skip_32x##n##x4d_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad32xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \
+ sad_array, n / 2); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/,
const uint16_t *src,
@@ -275,13 +290,15 @@ static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/,
}
}
-void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *const ref_array[4],
- int ref_stride, uint32_t sad_array[4]) {
+static VPX_FORCE_INLINE void highbd_sad16xNx4d_avx2(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4],
+ int ref_stride, uint32_t sad_array[4], int n) {
const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
uint16_t *refs[4];
__m256i sums_16[4];
__m256i sums_32[4];
+ const int height = VPXMIN(16, n);
+ const int num_iters = n / height;
int i;
refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
@@ -293,13 +310,13 @@ void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
sums_32[2] = _mm256_setzero_si256();
sums_32[3] = _mm256_setzero_si256();
- for (i = 0; i < 2; ++i) {
+ for (i = 0; i < num_iters; ++i) {
sums_16[0] = _mm256_setzero_si256();
sums_16[1] = _mm256_setzero_si256();
sums_16[2] = _mm256_setzero_si256();
sums_16[3] = _mm256_setzero_si256();
- highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16);
+ highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, height);
// sums_16 will outrange after 16 rows, so add current sums_16 to sums_32
sums_32[0] = _mm256_add_epi32(
@@ -328,6 +345,26 @@ void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
calc_final_4(sums_32, sad_array);
}
+#define HIGHBD_SAD16XNX4D(n) \
+ void vpx_highbd_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad16xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \
+ n); \
+ }
+
+#define HIGHBD_SADSKIP16XNx4D(n) \
+ void vpx_highbd_sad_skip_16x##n##x4d_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad16xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \
+ sad_array, n / 2); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
+
void vpx_highbd_sad16x16x4d_avx2(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4],
int ref_stride, uint32_t sad_array[4]) {
@@ -399,3 +436,27 @@ void vpx_highbd_sad16x8x4d_avx2(const uint8_t *src_ptr, int src_stride,
calc_final_4(sums_32, sad_array);
}
}
+
+// clang-format off
+HIGHBD_SAD64XNX4D(64)
+HIGHBD_SADSKIP64XNx4D(64)
+
+HIGHBD_SAD64XNX4D(32)
+HIGHBD_SADSKIP64XNx4D(32)
+
+HIGHBD_SAD32XNX4D(64)
+HIGHBD_SADSKIP32XNx4D(64)
+
+HIGHBD_SAD32XNX4D(32)
+HIGHBD_SADSKIP32XNx4D(32)
+
+HIGHBD_SAD32XNX4D(16)
+HIGHBD_SADSKIP32XNx4D(16)
+
+HIGHBD_SAD16XNX4D(32)
+HIGHBD_SADSKIP16XNx4D(32)
+
+HIGHBD_SADSKIP16XNx4D(16)
+
+HIGHBD_SADSKIP16XNx4D(8)
+ // clang-format on
diff --git a/vpx_dsp/x86/highbd_sad4d_sse2.asm b/vpx_dsp/x86/highbd_sad4d_sse2.asm
index 6c2a61e01..a07892d81 100644
--- a/vpx_dsp/x86/highbd_sad4d_sse2.asm
+++ b/vpx_dsp/x86/highbd_sad4d_sse2.asm
@@ -213,7 +213,12 @@ SECTION .text
; uint8_t *ref[4], int ref_stride,
; uint32_t res[4]);
; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8
-%macro HIGH_SADNXN4D 2
+; Macro Arguments:
+; 1: Width
+; 2: Height
+; 3: If 0, then normal sad, if 2, then skip every other row
+%macro HIGH_SADNXN4D 2-3 0
+%if %3 == 0 ; normal sad
%if UNIX64
cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
res, ref2, ref3, ref4
@@ -221,6 +226,15 @@ cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
ref2, ref3, ref4
%endif
+%else ; %3 == 2, downsample
+%if UNIX64
+cglobal highbd_sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+ res, ref2, ref3, ref4
+%else
+cglobal highbd_sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+ ref2, ref3, ref4
+%endif ;
+%endif ; sad/avg/skip
; set m1
push srcq
@@ -229,6 +243,10 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
pshufd m1, m1, 0x0
pop srcq
+%if %3 == 2 ; skip rows
+ lea src_strided, [2*src_strided]
+ lea ref_strided, [2*ref_strided]
+%endif ; skip rows
movsxdifnidn src_strideq, src_strided
movsxdifnidn ref_strideq, ref_strided
mov ref2q, [ref1q+gprsize*1]
@@ -244,9 +262,15 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
shl ref1q, 1
HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
-%rep (%2-4)/2
+%if %3 == 2 ; Downsampling by two
+%define num_rep (%2-8)/4
+%else
+%define num_rep (%2-4)/2
+%endif
+%rep num_rep
HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
+%undef rep
HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
; N.B. HIGH_PROCESS outputs dwords (32 bits)
; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM
@@ -265,6 +289,9 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
paddd m4, m0
paddd m6, m1
punpcklqdq m4, m6
+%if %3 == 2 ; skip rows
+ pslld m4, 1
+%endif
movifnidn r4, r4mp
movu [r4], m4
RET
@@ -285,3 +312,15 @@ HIGH_SADNXN4D 8, 8
HIGH_SADNXN4D 8, 4
HIGH_SADNXN4D 4, 8
HIGH_SADNXN4D 4, 4
+
+HIGH_SADNXN4D 64, 64, 2
+HIGH_SADNXN4D 64, 32, 2
+HIGH_SADNXN4D 32, 64, 2
+HIGH_SADNXN4D 32, 32, 2
+HIGH_SADNXN4D 32, 16, 2
+HIGH_SADNXN4D 16, 32, 2
+HIGH_SADNXN4D 16, 16, 2
+HIGH_SADNXN4D 16, 8, 2
+HIGH_SADNXN4D 8, 16, 2
+HIGH_SADNXN4D 8, 8, 2
+HIGH_SADNXN4D 4, 8, 2
diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c
index 231b67f80..78f8eb8bf 100644
--- a/vpx_dsp/x86/highbd_sad_avx2.c
+++ b/vpx_dsp/x86/highbd_sad_avx2.c
@@ -50,39 +50,49 @@ static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16,
}
}
-#define HIGHBD_SAD64XN(n) \
- unsigned int vpx_highbd_sad64x##n##_avx2( \
- const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
- int ref_stride) { \
- const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
- __m256i sums_32 = _mm256_setzero_si256(); \
- int i; \
- \
- for (i = 0; i < (n / 2); ++i) { \
- __m256i sums_16 = _mm256_setzero_si256(); \
- \
- highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); \
- \
- /* sums_16 will outrange after 2 rows, so add current sums_16 to \
- * sums_32*/ \
- sums_32 = _mm256_add_epi32( \
- sums_32, \
- _mm256_add_epi32( \
- _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \
- _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \
- \
- src += src_stride << 1; \
- ref += ref_stride << 1; \
- } \
- return calc_final(sums_32); \
+static VPX_FORCE_INLINE unsigned int highbd_sad64xN_avx2(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride,
+ int n) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ __m256i sums_32 = _mm256_setzero_si256();
+ int i;
+
+ for (i = 0; i < (n / 2); ++i) {
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2);
+
+ /* sums_16 will outrange after 2 rows, so add current sums_16 to
+ * sums_32*/
+ sums_32 = _mm256_add_epi32(
+ sums_32,
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));
+
+ src += src_stride << 1;
+ ref += ref_stride << 1;
}
+ return calc_final(sums_32);
+}
-// 64x64
-HIGHBD_SAD64XN(64)
+#define HIGHBD_SAD64XN(n) \
+ unsigned int vpx_highbd_sad64x##n##_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride) { \
+ return highbd_sad64xN_avx2(src, src_stride, ref, ref_stride, n); \
+ }
-// 64x32
-HIGHBD_SAD64XN(32)
+#define HIGHBD_SADSKIP64xN(n) \
+ unsigned int vpx_highbd_sad_skip_64x##n##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * highbd_sad64xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
+ n / 2); \
+ }
static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16,
const uint16_t *src, int src_stride,
@@ -107,42 +117,49 @@ static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16,
}
}
-#define HIGHBD_SAD32XN(n) \
- unsigned int vpx_highbd_sad32x##n##_avx2( \
- const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
- int ref_stride) { \
- const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
- __m256i sums_32 = _mm256_setzero_si256(); \
- int i; \
- \
- for (i = 0; i < (n / 8); ++i) { \
- __m256i sums_16 = _mm256_setzero_si256(); \
- \
- highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); \
- \
- /* sums_16 will outrange after 8 rows, so add current sums_16 to \
- * sums_32*/ \
- sums_32 = _mm256_add_epi32( \
- sums_32, \
- _mm256_add_epi32( \
- _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \
- _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \
- \
- src += src_stride << 3; \
- ref += ref_stride << 3; \
- } \
- return calc_final(sums_32); \
- }
+static VPX_FORCE_INLINE unsigned int highbd_sad32xN_avx2(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride,
+ int n) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ __m256i sums_32 = _mm256_setzero_si256();
+ int i;
-// 32x64
-HIGHBD_SAD32XN(64)
+ for (i = 0; i < (n / 8); ++i) {
+ __m256i sums_16 = _mm256_setzero_si256();
-// 32x32
-HIGHBD_SAD32XN(32)
+ highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8);
-// 32x16
-HIGHBD_SAD32XN(16)
+ /* sums_16 will outrange after 8 rows, so add current sums_16 to
+ * sums_32*/
+ sums_32 = _mm256_add_epi32(
+ sums_32,
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));
+
+ src += src_stride << 3;
+ ref += ref_stride << 3;
+ }
+ return calc_final(sums_32);
+}
+
+#define HIGHBD_SAD32XN(n) \
+ unsigned int vpx_highbd_sad32x##n##_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride) { \
+ return highbd_sad32xN_avx2(src, src_stride, ref, ref_stride, n); \
+ }
+
+#define HIGHBD_SADSKIP32xN(n) \
+ unsigned int vpx_highbd_sad_skip_32x##n##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * highbd_sad32xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
+ n / 2); \
+ }
static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16,
const uint16_t *src, int src_stride,
@@ -167,17 +184,22 @@ static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16,
}
}
-unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride) {
+static VPX_FORCE_INLINE unsigned int highbd_sad16xN_avx2(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride,
+ int n) {
const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
__m256i sums_32 = _mm256_setzero_si256();
+ const int height = VPXMIN(16, n);
+ const int num_iters = n / height;
int i;
- for (i = 0; i < 2; ++i) {
+ for (i = 0; i < num_iters; ++i) {
__m256i sums_16 = _mm256_setzero_si256();
- highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16);
+ highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, height);
// sums_16 will outrange after 16 rows, so add current sums_16 to sums_32
sums_32 = _mm256_add_epi32(
@@ -192,6 +214,21 @@ unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride,
return calc_final(sums_32);
}
+#define HIGHBD_SAD16XN(n) \
+ unsigned int vpx_highbd_sad16x##n##_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride) { \
+ return highbd_sad16xN_avx2(src, src_stride, ref, ref_stride, n); \
+ }
+
+#define HIGHBD_SADSKIP16xN(n) \
+ unsigned int vpx_highbd_sad_skip_16x##n##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * highbd_sad16xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
+ n / 2); \
+ }
+
unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride) {
const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
@@ -224,6 +261,23 @@ unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride,
}
}
+// clang-format off
+HIGHBD_SAD64XN(64)
+HIGHBD_SADSKIP64xN(64)
+HIGHBD_SAD64XN(32)
+HIGHBD_SADSKIP64xN(32)
+HIGHBD_SAD32XN(64)
+HIGHBD_SADSKIP32xN(64)
+HIGHBD_SAD32XN(32)
+HIGHBD_SADSKIP32xN(32)
+HIGHBD_SAD32XN(16)
+HIGHBD_SADSKIP32xN(16)
+HIGHBD_SAD16XN(32)
+HIGHBD_SADSKIP16xN(32)
+HIGHBD_SADSKIP16xN(16)
+HIGHBD_SADSKIP16xN(8)
+//clang-format on
+
// AVG -------------------------------------------------------------------------
static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16,
const uint16_t *src,
diff --git a/vpx_dsp/x86/highbd_sad_sse2.asm b/vpx_dsp/x86/highbd_sad_sse2.asm
index 6a1a6f3d6..62ad2237f 100644
--- a/vpx_dsp/x86/highbd_sad_sse2.asm
+++ b/vpx_dsp/x86/highbd_sad_sse2.asm
@@ -12,6 +12,11 @@
SECTION .text
+; Macro Arguments
+; Arg 1: Width
+; Arg 2: Height
+; Arg 3: Number of general purpose registers
+; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
%macro HIGH_SAD_FN 4
%if %4 == 0
%if %3 == 5
@@ -20,7 +25,7 @@ cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
-%else ; avg
+%elif %4 == 1 ; avg
%if %3 == 5
cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
second_pred, n_rows
@@ -35,7 +40,18 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
-%endif ; avg/sad
+%else ; %4 == 2, skip rows
+%if %3 == 5
+cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%endif ; sad/avg/skip
+%if %4 == 2 ; double the stride if we are skipping rows
+ lea src_strided, [src_strided*2]
+ lea ref_strided, [ref_strided*2]
+%endif
movsxdifnidn src_strideq, src_strided
movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
@@ -54,7 +70,11 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \
; uint8_t *ref, int ref_stride);
%macro HIGH_SAD64XN 1-2 0
HIGH_SAD_FN 64, %1, 5, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/2
+%else
mov n_rowsd, %1
+%endif
pxor m0, m0
pxor m6, m6
@@ -146,6 +166,9 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \
punpckldq m0, m6
movhlps m1, m0
paddd m0, m1
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
movd eax, m0
RET
%endmacro
@@ -155,13 +178,19 @@ HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
+HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2
+HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2
; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro HIGH_SAD32XN 1-2 0
HIGH_SAD_FN 32, %1, 5, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/2
+%else
mov n_rowsd, %1
+%endif
pxor m0, m0
pxor m6, m6
@@ -213,6 +242,9 @@ HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
punpckldq m0, m6
movhlps m1, m0
paddd m0, m1
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
movd eax, m0
RET
%endmacro
@@ -224,12 +256,19 @@ HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
+HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2
+HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2
+HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2
; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro HIGH_SAD16XN 1-2 0
HIGH_SAD_FN 16, %1, 5, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/4
+%else
mov n_rowsd, %1/2
+%endif
pxor m0, m0
pxor m6, m6
@@ -281,6 +320,9 @@ HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
punpckldq m0, m6
movhlps m1, m0
paddd m0, m1
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
movd eax, m0
RET
%endmacro
@@ -292,13 +334,19 @@ HIGH_SAD16XN 8 ; highbd_sad16x8_sse2
HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
-
+HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2
+HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2
+HIGH_SAD16XN 8, 2 ; highbd_sad_skip_16x8_sse2
; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro HIGH_SAD8XN 1-2 0
HIGH_SAD_FN 8, %1, 7, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/8
+%else
mov n_rowsd, %1/4
+%endif
pxor m0, m0
pxor m6, m6
@@ -350,6 +398,9 @@ HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
punpckldq m0, m6
movhlps m1, m0
paddd m0, m1
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
movd eax, m0
RET
%endmacro
@@ -361,3 +412,5 @@ HIGH_SAD8XN 4 ; highbd_sad8x4_sse2
HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2
HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2
+HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2
+HIGH_SAD8XN 8, 2 ; highbd_sad_skip_8x8_sse2
diff --git a/vpx_dsp/x86/inv_txfm_avx2.c b/vpx_dsp/x86/inv_txfm_avx2.c
new file mode 100644
index 000000000..752435d24
--- /dev/null
+++ b/vpx_dsp/x86/inv_txfm_avx2.c
@@ -0,0 +1,626 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h> // AVX2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+
+#define PAIR256_SET_EPI16(a, b) \
+ _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
+
+static INLINE void idct_load16x16(const tran_low_t *input, __m256i *in,
+ int stride) {
+ int i;
+ // Load 16x16 values
+ for (i = 0; i < 16; i++) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m128i in0 = _mm_loadu_si128((const __m128i *)(input + i * stride));
+ const __m128i in1 =
+ _mm_loadu_si128((const __m128i *)((input + i * stride) + 4));
+ const __m128i in2 =
+ _mm_loadu_si128((const __m128i *)((input + i * stride) + 8));
+ const __m128i in3 =
+ _mm_loadu_si128((const __m128i *)((input + i * stride) + 12));
+ const __m128i ls = _mm_packs_epi32(in0, in1);
+ const __m128i rs = _mm_packs_epi32(in2, in3);
+ in[i] = _mm256_inserti128_si256(_mm256_castsi128_si256(ls), rs, 1);
+#else
+ in[i] = _mm256_load_si256((const __m256i *)(input + i * stride));
+#endif
+ }
+}
+
+static INLINE __m256i dct_round_shift_avx2(__m256i in) {
+ const __m256i t = _mm256_add_epi32(in, _mm256_set1_epi32(DCT_CONST_ROUNDING));
+ return _mm256_srai_epi32(t, DCT_CONST_BITS);
+}
+
+static INLINE __m256i idct_madd_round_shift_avx2(__m256i *in, __m256i *cospi) {
+ const __m256i t = _mm256_madd_epi16(*in, *cospi);
+ return dct_round_shift_avx2(t);
+}
+
+// Calculate the dot product between in0/1 and x and wrap to short.
+static INLINE __m256i idct_calc_wraplow_avx2(__m256i *in0, __m256i *in1,
+ __m256i *x) {
+ const __m256i t0 = idct_madd_round_shift_avx2(in0, x);
+ const __m256i t1 = idct_madd_round_shift_avx2(in1, x);
+ return _mm256_packs_epi32(t0, t1);
+}
+
+// Multiply elements by constants and add them together.
+static INLINE void butterfly16(__m256i in0, __m256i in1, int c0, int c1,
+ __m256i *out0, __m256i *out1) {
+ __m256i cst0 = PAIR256_SET_EPI16(c0, -c1);
+ __m256i cst1 = PAIR256_SET_EPI16(c1, c0);
+ __m256i lo = _mm256_unpacklo_epi16(in0, in1);
+ __m256i hi = _mm256_unpackhi_epi16(in0, in1);
+ *out0 = idct_calc_wraplow_avx2(&lo, &hi, &cst0);
+ *out1 = idct_calc_wraplow_avx2(&lo, &hi, &cst1);
+}
+
+static INLINE void idct16_16col(__m256i *in, __m256i *out) {
+ __m256i step1[16], step2[16];
+
+ // stage 2
+ butterfly16(in[1], in[15], cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
+ butterfly16(in[9], in[7], cospi_14_64, cospi_18_64, &step2[9], &step2[14]);
+ butterfly16(in[5], in[11], cospi_22_64, cospi_10_64, &step2[10], &step2[13]);
+ butterfly16(in[13], in[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);
+
+ // stage 3
+ butterfly16(in[2], in[14], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+ butterfly16(in[10], in[6], cospi_12_64, cospi_20_64, &step1[5], &step1[6]);
+ step1[8] = _mm256_add_epi16(step2[8], step2[9]);
+ step1[9] = _mm256_sub_epi16(step2[8], step2[9]);
+ step1[10] = _mm256_sub_epi16(step2[11], step2[10]);
+ step1[11] = _mm256_add_epi16(step2[10], step2[11]);
+ step1[12] = _mm256_add_epi16(step2[12], step2[13]);
+ step1[13] = _mm256_sub_epi16(step2[12], step2[13]);
+ step1[14] = _mm256_sub_epi16(step2[15], step2[14]);
+ step1[15] = _mm256_add_epi16(step2[14], step2[15]);
+
+ // stage 4
+ butterfly16(in[0], in[8], cospi_16_64, cospi_16_64, &step2[1], &step2[0]);
+ butterfly16(in[4], in[12], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
+ butterfly16(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+ &step2[14]);
+ butterfly16(step1[10], step1[13], -cospi_8_64, -cospi_24_64, &step2[13],
+ &step2[10]);
+ step2[5] = _mm256_sub_epi16(step1[4], step1[5]);
+ step1[4] = _mm256_add_epi16(step1[4], step1[5]);
+ step2[6] = _mm256_sub_epi16(step1[7], step1[6]);
+ step1[7] = _mm256_add_epi16(step1[6], step1[7]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5
+ step1[0] = _mm256_add_epi16(step2[0], step2[3]);
+ step1[1] = _mm256_add_epi16(step2[1], step2[2]);
+ step1[2] = _mm256_sub_epi16(step2[1], step2[2]);
+ step1[3] = _mm256_sub_epi16(step2[0], step2[3]);
+ butterfly16(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5],
+ &step1[6]);
+ step1[8] = _mm256_add_epi16(step2[8], step2[11]);
+ step1[9] = _mm256_add_epi16(step2[9], step2[10]);
+ step1[10] = _mm256_sub_epi16(step2[9], step2[10]);
+ step1[11] = _mm256_sub_epi16(step2[8], step2[11]);
+ step1[12] = _mm256_sub_epi16(step2[15], step2[12]);
+ step1[13] = _mm256_sub_epi16(step2[14], step2[13]);
+ step1[14] = _mm256_add_epi16(step2[14], step2[13]);
+ step1[15] = _mm256_add_epi16(step2[15], step2[12]);
+
+ // stage 6
+ step2[0] = _mm256_add_epi16(step1[0], step1[7]);
+ step2[1] = _mm256_add_epi16(step1[1], step1[6]);
+ step2[2] = _mm256_add_epi16(step1[2], step1[5]);
+ step2[3] = _mm256_add_epi16(step1[3], step1[4]);
+ step2[4] = _mm256_sub_epi16(step1[3], step1[4]);
+ step2[5] = _mm256_sub_epi16(step1[2], step1[5]);
+ step2[6] = _mm256_sub_epi16(step1[1], step1[6]);
+ step2[7] = _mm256_sub_epi16(step1[0], step1[7]);
+ butterfly16(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10],
+ &step2[13]);
+ butterfly16(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11],
+ &step2[12]);
+
+ // stage 7
+ out[0] = _mm256_add_epi16(step2[0], step1[15]);
+ out[1] = _mm256_add_epi16(step2[1], step1[14]);
+ out[2] = _mm256_add_epi16(step2[2], step2[13]);
+ out[3] = _mm256_add_epi16(step2[3], step2[12]);
+ out[4] = _mm256_add_epi16(step2[4], step2[11]);
+ out[5] = _mm256_add_epi16(step2[5], step2[10]);
+ out[6] = _mm256_add_epi16(step2[6], step1[9]);
+ out[7] = _mm256_add_epi16(step2[7], step1[8]);
+ out[8] = _mm256_sub_epi16(step2[7], step1[8]);
+ out[9] = _mm256_sub_epi16(step2[6], step1[9]);
+ out[10] = _mm256_sub_epi16(step2[5], step2[10]);
+ out[11] = _mm256_sub_epi16(step2[4], step2[11]);
+ out[12] = _mm256_sub_epi16(step2[3], step2[12]);
+ out[13] = _mm256_sub_epi16(step2[2], step2[13]);
+ out[14] = _mm256_sub_epi16(step2[1], step1[14]);
+ out[15] = _mm256_sub_epi16(step2[0], step1[15]);
+}
+
+static INLINE void recon_and_store16(uint8_t *dest, __m256i in_x) {
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i d0 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dest)));
+ d0 = _mm256_permute4x64_epi64(d0, 0xd8);
+ d0 = _mm256_unpacklo_epi8(d0, zero);
+ d0 = _mm256_add_epi16(in_x, d0);
+ d0 = _mm256_packus_epi16(
+ d0, _mm256_castsi128_si256(_mm256_extractf128_si256(d0, 1)));
+
+ _mm_storeu_si128((__m128i *)dest, _mm256_castsi256_si128(d0));
+}
+
+static INLINE void write_buffer_16x1(uint8_t *dest, __m256i in) {
+ const __m256i final_rounding = _mm256_set1_epi16(1 << 5);
+ __m256i out;
+ out = _mm256_adds_epi16(in, final_rounding);
+ out = _mm256_srai_epi16(out, 6);
+ recon_and_store16(dest, out);
+}
+
+static INLINE void store_buffer_16x32(__m256i *in, uint8_t *dst, int stride) {
+ const __m256i final_rounding = _mm256_set1_epi16(1 << 5);
+ int j = 0;
+ while (j < 32) {
+ in[j] = _mm256_adds_epi16(in[j], final_rounding);
+ in[j + 1] = _mm256_adds_epi16(in[j + 1], final_rounding);
+
+ in[j] = _mm256_srai_epi16(in[j], 6);
+ in[j + 1] = _mm256_srai_epi16(in[j + 1], 6);
+
+ recon_and_store16(dst, in[j]);
+ dst += stride;
+ recon_and_store16(dst, in[j + 1]);
+ dst += stride;
+ j += 2;
+ }
+}
+
+static INLINE void transpose2_8x8_avx2(__m256i *in, __m256i *out) {
+ int i;
+ __m256i t[16], u[16];
+ // (1st, 2nd) ==> (lo, hi)
+ // (0, 1) ==> (0, 1)
+ // (2, 3) ==> (2, 3)
+ // (4, 5) ==> (4, 5)
+ // (6, 7) ==> (6, 7)
+ for (i = 0; i < 4; i++) {
+ t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]);
+ t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]);
+ }
+
+ // (1st, 2nd) ==> (lo, hi)
+ // (0, 2) ==> (0, 2)
+ // (1, 3) ==> (1, 3)
+ // (4, 6) ==> (4, 6)
+ // (5, 7) ==> (5, 7)
+ for (i = 0; i < 2; i++) {
+ u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]);
+ u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]);
+
+ u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]);
+ u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]);
+ }
+
+ // (1st, 2nd) ==> (lo, hi)
+ // (0, 4) ==> (0, 1)
+ // (1, 5) ==> (4, 5)
+ // (2, 6) ==> (2, 3)
+ // (3, 7) ==> (6, 7)
+ for (i = 0; i < 2; i++) {
+ out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]);
+ out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]);
+
+ out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]);
+ out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]);
+ }
+}
+
+static INLINE void transpose_16bit_16x16_avx2(__m256i *in, __m256i *out) {
+ __m256i t[16];
+
+#define LOADL(idx) \
+ t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
+ t[idx] = _mm256_inserti128_si256( \
+ t[idx], _mm_load_si128((__m128i const *)&in[(idx) + 8]), 1);
+
+#define LOADR(idx) \
+ t[8 + (idx)] = \
+ _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
+ t[8 + (idx)] = _mm256_inserti128_si256( \
+ t[8 + (idx)], _mm_load_si128((__m128i const *)&in[(idx) + 8] + 1), 1);
+
+ // load left 8x16
+ LOADL(0)
+ LOADL(1)
+ LOADL(2)
+ LOADL(3)
+ LOADL(4)
+ LOADL(5)
+ LOADL(6)
+ LOADL(7)
+
+ // load right 8x16
+ LOADR(0)
+ LOADR(1)
+ LOADR(2)
+ LOADR(3)
+ LOADR(4)
+ LOADR(5)
+ LOADR(6)
+ LOADR(7)
+
+ // get the top 16x8 result
+ transpose2_8x8_avx2(t, out);
+ // get the bottom 16x8 result
+ transpose2_8x8_avx2(&t[8], &out[8]);
+}
+
+void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int i;
+ __m256i in[16];
+
+ // Load 16x16 values
+ idct_load16x16(input, in, 16);
+
+ transpose_16bit_16x16_avx2(in, in);
+ idct16_16col(in, in);
+
+ transpose_16bit_16x16_avx2(in, in);
+ idct16_16col(in, in);
+
+ for (i = 0; i < 16; ++i) {
+ write_buffer_16x1(dest + i * stride, in[i]);
+ }
+}
+
+// Only do addition and subtraction butterfly, size = 16, 32
+static INLINE void add_sub_butterfly_avx2(__m256i *in, __m256i *out, int size) {
+ int i = 0;
+ const int num = size >> 1;
+ const int bound = size - 1;
+ while (i < num) {
+ out[i] = _mm256_add_epi16(in[i], in[bound - i]);
+ out[bound - i] = _mm256_sub_epi16(in[i], in[bound - i]);
+ i++;
+ }
+}
+
+// For each 16x32 block __m256i in[32],
+// Input with index, 0, 4, 8, 12, 16, 20, 24, 28
+// output pixels: 0-7 in __m256i out[32]
+static INLINE void idct32_1024_16x32_quarter_1(__m256i *in, __m256i *out) {
+ __m256i step1[8], step2[8];
+
+ // stage 3
+ butterfly16(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+ butterfly16(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], &step1[6]);
+
+ // stage 4
+ butterfly16(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], &step2[0]);
+ butterfly16(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
+ step2[4] = _mm256_add_epi16(step1[4], step1[5]);
+ step2[5] = _mm256_sub_epi16(step1[4], step1[5]);
+ step2[6] = _mm256_sub_epi16(step1[7], step1[6]);
+ step2[7] = _mm256_add_epi16(step1[7], step1[6]);
+
+ // stage 5
+ step1[0] = _mm256_add_epi16(step2[0], step2[3]);
+ step1[1] = _mm256_add_epi16(step2[1], step2[2]);
+ step1[2] = _mm256_sub_epi16(step2[1], step2[2]);
+ step1[3] = _mm256_sub_epi16(step2[0], step2[3]);
+ step1[4] = step2[4];
+ butterfly16(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5],
+ &step1[6]);
+ step1[7] = step2[7];
+
+ // stage 6
+ out[0] = _mm256_add_epi16(step1[0], step1[7]);
+ out[1] = _mm256_add_epi16(step1[1], step1[6]);
+ out[2] = _mm256_add_epi16(step1[2], step1[5]);
+ out[3] = _mm256_add_epi16(step1[3], step1[4]);
+ out[4] = _mm256_sub_epi16(step1[3], step1[4]);
+ out[5] = _mm256_sub_epi16(step1[2], step1[5]);
+ out[6] = _mm256_sub_epi16(step1[1], step1[6]);
+ out[7] = _mm256_sub_epi16(step1[0], step1[7]);
+}
+
+static INLINE void idct32_16x32_quarter_2_stage_4_to_6(__m256i *step1,
+ __m256i *out) {
+ __m256i step2[32];
+
+ // stage 4
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ butterfly16(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+ &step2[14]);
+ butterfly16(step1[13], step1[10], -cospi_8_64, cospi_24_64, &step2[10],
+ &step2[13]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ // stage 5
+ step1[8] = _mm256_add_epi16(step2[8], step2[11]);
+ step1[9] = _mm256_add_epi16(step2[9], step2[10]);
+ step1[10] = _mm256_sub_epi16(step2[9], step2[10]);
+ step1[11] = _mm256_sub_epi16(step2[8], step2[11]);
+ step1[12] = _mm256_sub_epi16(step2[15], step2[12]);
+ step1[13] = _mm256_sub_epi16(step2[14], step2[13]);
+ step1[14] = _mm256_add_epi16(step2[14], step2[13]);
+ step1[15] = _mm256_add_epi16(step2[15], step2[12]);
+
+ // stage 6
+ out[8] = step1[8];
+ out[9] = step1[9];
+ butterfly16(step1[13], step1[10], cospi_16_64, cospi_16_64, &out[10],
+ &out[13]);
+ butterfly16(step1[12], step1[11], cospi_16_64, cospi_16_64, &out[11],
+ &out[12]);
+ out[14] = step1[14];
+ out[15] = step1[15];
+}
+
+// For each 16x32 block __m256i in[32],
+// Input with index, 2, 6, 10, 14, 18, 22, 26, 30
+// output pixels: 8-15 in __m256i out[32]
+static INLINE void idct32_1024_16x32_quarter_2(__m256i *in, __m256i *out) {
+ __m256i step1[16], step2[16];
+
+ // stage 2
+ butterfly16(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
+ butterfly16(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], &step2[14]);
+ butterfly16(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], &step2[13]);
+ butterfly16(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);
+
+ // stage 3
+ step1[8] = _mm256_add_epi16(step2[8], step2[9]);
+ step1[9] = _mm256_sub_epi16(step2[8], step2[9]);
+ step1[10] = _mm256_sub_epi16(step2[11], step2[10]);
+ step1[11] = _mm256_add_epi16(step2[11], step2[10]);
+ step1[12] = _mm256_add_epi16(step2[12], step2[13]);
+ step1[13] = _mm256_sub_epi16(step2[12], step2[13]);
+ step1[14] = _mm256_sub_epi16(step2[15], step2[14]);
+ step1[15] = _mm256_add_epi16(step2[15], step2[14]);
+
+ idct32_16x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void idct32_16x32_quarter_3_4_stage_4_to_7(__m256i *step1,
+ __m256i *out) {
+ __m256i step2[32];
+
+ // stage 4
+ step2[16] = _mm256_add_epi16(step1[16], step1[19]);
+ step2[17] = _mm256_add_epi16(step1[17], step1[18]);
+ step2[18] = _mm256_sub_epi16(step1[17], step1[18]);
+ step2[19] = _mm256_sub_epi16(step1[16], step1[19]);
+ step2[20] = _mm256_sub_epi16(step1[23], step1[20]);
+ step2[21] = _mm256_sub_epi16(step1[22], step1[21]);
+ step2[22] = _mm256_add_epi16(step1[22], step1[21]);
+ step2[23] = _mm256_add_epi16(step1[23], step1[20]);
+
+ step2[24] = _mm256_add_epi16(step1[24], step1[27]);
+ step2[25] = _mm256_add_epi16(step1[25], step1[26]);
+ step2[26] = _mm256_sub_epi16(step1[25], step1[26]);
+ step2[27] = _mm256_sub_epi16(step1[24], step1[27]);
+ step2[28] = _mm256_sub_epi16(step1[31], step1[28]);
+ step2[29] = _mm256_sub_epi16(step1[30], step1[29]);
+ step2[30] = _mm256_add_epi16(step1[29], step1[30]);
+ step2[31] = _mm256_add_epi16(step1[28], step1[31]);
+
+ // stage 5
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ butterfly16(step2[29], step2[18], cospi_24_64, cospi_8_64, &step1[18],
+ &step1[29]);
+ butterfly16(step2[28], step2[19], cospi_24_64, cospi_8_64, &step1[19],
+ &step1[28]);
+ butterfly16(step2[27], step2[20], -cospi_8_64, cospi_24_64, &step1[20],
+ &step1[27]);
+ butterfly16(step2[26], step2[21], -cospi_8_64, cospi_24_64, &step1[21],
+ &step1[26]);
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // stage 6
+ out[16] = _mm256_add_epi16(step1[16], step1[23]);
+ out[17] = _mm256_add_epi16(step1[17], step1[22]);
+ out[18] = _mm256_add_epi16(step1[18], step1[21]);
+ out[19] = _mm256_add_epi16(step1[19], step1[20]);
+ step2[20] = _mm256_sub_epi16(step1[19], step1[20]);
+ step2[21] = _mm256_sub_epi16(step1[18], step1[21]);
+ step2[22] = _mm256_sub_epi16(step1[17], step1[22]);
+ step2[23] = _mm256_sub_epi16(step1[16], step1[23]);
+
+ step2[24] = _mm256_sub_epi16(step1[31], step1[24]);
+ step2[25] = _mm256_sub_epi16(step1[30], step1[25]);
+ step2[26] = _mm256_sub_epi16(step1[29], step1[26]);
+ step2[27] = _mm256_sub_epi16(step1[28], step1[27]);
+ out[28] = _mm256_add_epi16(step1[27], step1[28]);
+ out[29] = _mm256_add_epi16(step1[26], step1[29]);
+ out[30] = _mm256_add_epi16(step1[25], step1[30]);
+ out[31] = _mm256_add_epi16(step1[24], step1[31]);
+
+ // stage 7
+ butterfly16(step2[27], step2[20], cospi_16_64, cospi_16_64, &out[20],
+ &out[27]);
+ butterfly16(step2[26], step2[21], cospi_16_64, cospi_16_64, &out[21],
+ &out[26]);
+ butterfly16(step2[25], step2[22], cospi_16_64, cospi_16_64, &out[22],
+ &out[25]);
+ butterfly16(step2[24], step2[23], cospi_16_64, cospi_16_64, &out[23],
+ &out[24]);
+}
+
+static INLINE void idct32_1024_16x32_quarter_1_2(__m256i *in, __m256i *out) {
+ __m256i temp[16];
+
+ // For each 16x32 block __m256i in[32],
+ // Input with index, 0, 4, 8, 12, 16, 20, 24, 28
+ // output pixels: 0-7 in __m256i out[32]
+ idct32_1024_16x32_quarter_1(in, temp);
+
+ // Input with index, 2, 6, 10, 14, 18, 22, 26, 30
+ // output pixels: 8-15 in __m256i out[32]
+ idct32_1024_16x32_quarter_2(in, temp);
+
+ // stage 7
+ add_sub_butterfly_avx2(temp, out, 16);
+}
+
+// For each 16x32 block __m256i in[32],
+// Input with odd index,
+// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+// output pixels: 16-23, 24-31 in __m256i out[32]
+static INLINE void idct32_1024_16x32_quarter_3_4(__m256i *in, __m256i *out) {
+ __m256i step1[32], step2[32];
+
+ // stage 1
+ butterfly16(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], &step1[31]);
+ butterfly16(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], &step1[30]);
+ butterfly16(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], &step1[29]);
+ butterfly16(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]);
+
+ butterfly16(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], &step1[27]);
+ butterfly16(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], &step1[26]);
+
+ butterfly16(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], &step1[25]);
+ butterfly16(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]);
+
+ // stage 2
+ step2[16] = _mm256_add_epi16(step1[16], step1[17]);
+ step2[17] = _mm256_sub_epi16(step1[16], step1[17]);
+ step2[18] = _mm256_sub_epi16(step1[19], step1[18]);
+ step2[19] = _mm256_add_epi16(step1[19], step1[18]);
+ step2[20] = _mm256_add_epi16(step1[20], step1[21]);
+ step2[21] = _mm256_sub_epi16(step1[20], step1[21]);
+ step2[22] = _mm256_sub_epi16(step1[23], step1[22]);
+ step2[23] = _mm256_add_epi16(step1[23], step1[22]);
+
+ step2[24] = _mm256_add_epi16(step1[24], step1[25]);
+ step2[25] = _mm256_sub_epi16(step1[24], step1[25]);
+ step2[26] = _mm256_sub_epi16(step1[27], step1[26]);
+ step2[27] = _mm256_add_epi16(step1[27], step1[26]);
+ step2[28] = _mm256_add_epi16(step1[28], step1[29]);
+ step2[29] = _mm256_sub_epi16(step1[28], step1[29]);
+ step2[30] = _mm256_sub_epi16(step1[31], step1[30]);
+ step2[31] = _mm256_add_epi16(step1[31], step1[30]);
+
+ // stage 3
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ butterfly16(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17],
+ &step1[30]);
+ butterfly16(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18],
+ &step1[29]);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ butterfly16(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21],
+ &step1[26]);
+ butterfly16(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22],
+ &step1[25]);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ idct32_16x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static INLINE void idct32_1024_16x32(__m256i *in, __m256i *out) {
+ __m256i temp[32];
+
+ // For each 16x32 block __m256i in[32],
+ // Input with index, 0, 4, 8, 12, 16, 20, 24, 28
+ // output pixels: 0-7 in __m256i out[32]
+ // AND
+ // Input with index, 2, 6, 10, 14, 18, 22, 26, 30
+ // output pixels: 8-15 in __m256i out[32]
+ idct32_1024_16x32_quarter_1_2(in, temp);
+
+ // For each 16x32 block __m256i in[32],
+ // Input with odd index,
+ // 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ // output pixels: 16-23, 24-31 in __m256i out[32]
+ idct32_1024_16x32_quarter_3_4(in, temp);
+
+ // final stage
+ add_sub_butterfly_avx2(temp, out, 32);
+}
+
+void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m256i l[32], r[32], out[32], *in;
+ int i;
+
+ in = l;
+
+ for (i = 0; i < 2; i++) {
+ idct_load16x16(input, in, 32);
+ transpose_16bit_16x16_avx2(in, in);
+
+ idct_load16x16(input + 16, in + 16, 32);
+ transpose_16bit_16x16_avx2(in + 16, in + 16);
+ idct32_1024_16x32(in, in);
+
+ in = r;
+ input += 32 << 4;
+ }
+
+ for (i = 0; i < 32; i += 16) {
+ transpose_16bit_16x16_avx2(l + i, out);
+ transpose_16bit_16x16_avx2(r + i, out + 16);
+ idct32_1024_16x32(out, out);
+
+ store_buffer_16x32(out, dest, stride);
+ dest += 16;
+ }
+}
+
+// Case when only upper-left 16x16 has non-zero coeff
+void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m256i in[32], io[32], out[32];
+ int i;
+
+ for (i = 16; i < 32; i++) {
+ in[i] = _mm256_setzero_si256();
+ }
+
+ // rows
+ idct_load16x16(input, in, 32);
+ transpose_16bit_16x16_avx2(in, in);
+ idct32_1024_16x32(in, io);
+
+ // columns
+ for (i = 0; i < 32; i += 16) {
+ transpose_16bit_16x16_avx2(io + i, in);
+ idct32_1024_16x32(in, out);
+
+ store_buffer_16x32(out, dest, stride);
+ dest += 16;
+ }
+}
diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c
index 7d8352721..5ff5abc11 100644
--- a/vpx_dsp/x86/quantize_avx.c
+++ b/vpx_dsp/x86/quantize_avx.c
@@ -19,17 +19,18 @@
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_dsp/x86/quantize_sse2.h"
#include "vpx_dsp/x86/quantize_ssse3.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *zbin_ptr, const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
- uint16_t *eob_ptr, const int16_t *scan,
- const int16_t *iscan) {
+ const struct macroblock_plane *const mb_plane,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const struct ScanOrder *const scan_order) {
const __m128i zero = _mm_setzero_si128();
const __m256i big_zero = _mm256_setzero_si256();
int index;
+ const int16_t *iscan = scan_order->iscan;
__m128i zbin, round, quant, dequant, shift;
__m128i coeff0, coeff1;
@@ -38,12 +39,9 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
__m128i all_zero;
__m128i eob = zero, eob0;
- (void)scan;
-
*eob_ptr = 0;
- load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
- dequant_ptr, &dequant, quant_shift_ptr, &shift);
+ load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift);
// Do DC and first 15 AC.
coeff0 = load_tran_low(coeff_ptr);
@@ -140,17 +138,15 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
*eob_ptr = accumulate_eob(eob);
}
-void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *zbin_ptr, const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr,
+void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr,
+ const struct macroblock_plane *const mb_plane,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
+ const struct ScanOrder *const scan_order) {
const __m128i zero = _mm_setzero_si128();
- const __m128i one = _mm_set1_epi16(1);
const __m256i big_zero = _mm256_setzero_si256();
int index;
+ const int16_t *iscan = scan_order->iscan;
__m128i zbin, round, quant, dequant, shift;
__m128i coeff0, coeff1;
@@ -159,27 +155,8 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
__m128i all_zero;
__m128i eob = zero, eob0;
- (void)scan;
- (void)n_coeffs;
-
- // Setup global values.
- // The 32x32 halves zbin and round.
- zbin = _mm_load_si128((const __m128i *)zbin_ptr);
- // Shift with rounding.
- zbin = _mm_add_epi16(zbin, one);
- zbin = _mm_srli_epi16(zbin, 1);
- // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
- // it is a strict "greater" comparison.
- zbin = _mm_sub_epi16(zbin, one);
-
- round = _mm_load_si128((const __m128i *)round_ptr);
- round = _mm_add_epi16(round, one);
- round = _mm_srli_epi16(round, 1);
-
- quant = _mm_load_si128((const __m128i *)quant_ptr);
- dequant = _mm_load_si128((const __m128i *)dequant_ptr);
- shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
- shift = _mm_slli_epi16(shift, 1);
+ load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant,
+ &shift);
// Do DC and first 15 AC.
coeff0 = load_tran_low(coeff_ptr);
diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c
index 28f7c9c7d..d4872f6bc 100644
--- a/vpx_dsp/x86/quantize_avx2.c
+++ b/vpx_dsp/x86/quantize_avx2.c
@@ -13,13 +13,15 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
static VPX_FORCE_INLINE void load_b_values_avx2(
- const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr,
- __m256i *round, const int16_t *quant_ptr, __m256i *quant,
- const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr,
+ const struct macroblock_plane *mb_plane, __m256i *zbin, __m256i *round,
+ __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant,
__m256i *shift, int log_scale) {
- *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr));
+ *zbin =
+ _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->zbin));
*zbin = _mm256_permute4x64_epi64(*zbin, 0x54);
if (log_scale > 0) {
const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1)));
@@ -30,7 +32,8 @@ static VPX_FORCE_INLINE void load_b_values_avx2(
// calculating the zbin mask. (See quantize_b_logscale{0,1,2}_16)
*zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1));
- *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr));
+ *round =
+ _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->round));
*round = _mm256_permute4x64_epi64(*round, 0x54);
if (log_scale > 0) {
const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1)));
@@ -38,12 +41,14 @@ static VPX_FORCE_INLINE void load_b_values_avx2(
*round = _mm256_srai_epi16(*round, log_scale);
}
- *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr));
+ *quant =
+ _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->quant));
*quant = _mm256_permute4x64_epi64(*quant, 0x54);
*dequant =
_mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
*dequant = _mm256_permute4x64_epi64(*dequant, 0x54);
- *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr));
+ *shift = _mm256_castsi128_si256(
+ _mm_load_si128((const __m128i *)mb_plane->quant_shift));
*shift = _mm256_permute4x64_epi64(*shift, 0x54);
}
@@ -151,20 +156,17 @@ static VPX_FORCE_INLINE int16_t accumulate_eob256(__m256i eob256) {
}
void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *zbin_ptr, const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
- uint16_t *eob_ptr, const int16_t *scan,
- const int16_t *iscan) {
+ const struct macroblock_plane *const mb_plane,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const struct ScanOrder *const scan_order) {
__m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift, v_nz_mask;
__m256i v_eobmax = _mm256_setzero_si256();
intptr_t count;
- (void)scan;
+ const int16_t *iscan = scan_order->iscan;
- load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr,
- &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr,
- &v_quant_shift, 0);
+ load_b_values_avx2(mb_plane, &v_zbin, &v_round, &v_quant, dequant_ptr,
+ &v_dequant, &v_quant_shift, 0);
// Do DC and first 15 AC.
v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant,
&v_dequant, &v_round, &v_zbin, &v_quant_shift);
@@ -250,23 +252,18 @@ static VPX_FORCE_INLINE __m256i quantize_b_32x32_16(
}
}
-void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *zbin_ptr,
- const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr,
+void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr,
+ const struct macroblock_plane *const mb_plane,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
+ const struct ScanOrder *const scan_order) {
__m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift;
__m256i v_eobmax = _mm256_setzero_si256();
intptr_t count;
- (void)n_coeffs;
- (void)scan;
+ const int16_t *iscan = scan_order->iscan;
- load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr,
- &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr,
- &v_quant_shift, 1);
+ load_b_values_avx2(mb_plane, &v_zbin, &v_round, &v_quant, dequant_ptr,
+ &v_dequant, &v_quant_shift, 1);
// Do DC and first 15 AC.
v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan,
diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c
index 9533e7916..64838eaa7 100644
--- a/vpx_dsp/x86/quantize_sse2.c
+++ b/vpx_dsp/x86/quantize_sse2.c
@@ -16,16 +16,16 @@
#include "vpx/vpx_integer.h"
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_dsp/x86/quantize_sse2.h"
+#include "vp9/common/vp9_scan.h"
void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *zbin_ptr, const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
- uint16_t *eob_ptr, const int16_t *scan,
- const int16_t *iscan) {
+ const struct macroblock_plane *const mb_plane,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const struct ScanOrder *const scan_order) {
const __m128i zero = _mm_setzero_si128();
int index = 16;
+ const int16_t *iscan = scan_order->iscan;
__m128i zbin, round, quant, dequant, shift;
__m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
@@ -33,11 +33,8 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
__m128i cmp_mask0, cmp_mask1;
__m128i eob, eob0;
- (void)scan;
-
// Setup global values.
- load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
- dequant_ptr, &dequant, quant_shift_ptr, &shift);
+ load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift);
// Do DC and first 15 AC.
coeff0 = load_tran_low(coeff_ptr);
diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h
index 27bfb4e41..82c755a0c 100644
--- a/vpx_dsp/x86/quantize_sse2.h
+++ b/vpx_dsp/x86/quantize_sse2.h
@@ -15,26 +15,53 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_block.h"
-static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,
- const int16_t *round_ptr, __m128i *round,
- const int16_t *quant_ptr, __m128i *quant,
+static INLINE void load_b_values(const struct macroblock_plane *const mb_plane,
+ __m128i *zbin, __m128i *round, __m128i *quant,
const int16_t *dequant_ptr, __m128i *dequant,
- const int16_t *shift_ptr, __m128i *shift) {
- *zbin = _mm_load_si128((const __m128i *)zbin_ptr);
- *round = _mm_load_si128((const __m128i *)round_ptr);
- *quant = _mm_load_si128((const __m128i *)quant_ptr);
+ __m128i *shift) {
+ *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin);
+ *round = _mm_load_si128((const __m128i *)mb_plane->round);
+ *quant = _mm_load_si128((const __m128i *)mb_plane->quant);
*zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1));
*dequant = _mm_load_si128((const __m128i *)dequant_ptr);
- *shift = _mm_load_si128((const __m128i *)shift_ptr);
+ *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift);
}
-static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round,
- const int16_t *quant_ptr, __m128i *quant,
+static INLINE void load_b_values32x32(
+ const struct macroblock_plane *const mb_plane, __m128i *zbin,
+ __m128i *round, __m128i *quant, const int16_t *dequant_ptr,
+ __m128i *dequant, __m128i *shift) {
+ const __m128i one = _mm_set1_epi16(1);
+ // The 32x32 halves zbin and round.
+ *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin);
+ // Shift with rounding.
+ *zbin = _mm_add_epi16(*zbin, one);
+ *zbin = _mm_srli_epi16(*zbin, 1);
+ // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
+ // it is a strict "greater" comparison.
+ *zbin = _mm_sub_epi16(*zbin, one);
+
+ *round = _mm_load_si128((const __m128i *)mb_plane->round);
+ *round = _mm_add_epi16(*round, one);
+ *round = _mm_srli_epi16(*round, 1);
+
+ *quant = _mm_load_si128((const __m128i *)mb_plane->quant);
+ *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift);
+ // I suspect this is not technically OK because quant_shift can be up
+ // to 1 << 16 and shifting up again will outrange that, but the test is not
+ // comprehensive enough to catch that and "it's been that way forever"
+ *shift = _mm_slli_epi16(*shift, 1);
+}
+
+static INLINE void load_fp_values(const struct macroblock_plane *mb_plane,
+ __m128i *round, __m128i *quant,
const int16_t *dequant_ptr,
__m128i *dequant) {
- *round = _mm_load_si128((const __m128i *)round_ptr);
- *quant = _mm_load_si128((const __m128i *)quant_ptr);
+ *round = _mm_load_si128((const __m128i *)mb_plane->round_fp);
+ *quant = _mm_load_si128((const __m128i *)mb_plane->quant_fp);
*dequant = _mm_load_si128((const __m128i *)dequant_ptr);
}
diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c
index 476230286..2c6d851a1 100644
--- a/vpx_dsp/x86/quantize_ssse3.c
+++ b/vpx_dsp/x86/quantize_ssse3.c
@@ -16,16 +16,17 @@
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_dsp/x86/quantize_sse2.h"
#include "vpx_dsp/x86/quantize_ssse3.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *zbin_ptr, const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr,
+ const struct macroblock_plane *const mb_plane,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
+ const struct ScanOrder *const scan_order) {
const __m128i zero = _mm_setzero_si128();
int index = 16;
+ const int16_t *iscan = scan_order->iscan;
__m128i zbin, round, quant, dequant, shift;
__m128i coeff0, coeff1;
@@ -33,10 +34,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
__m128i cmp_mask0, cmp_mask1;
__m128i eob, eob0;
- (void)scan;
-
- load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
- dequant_ptr, &dequant, quant_shift_ptr, &shift);
+ load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift);
// Do DC and first 15 AC.
coeff0 = load_tran_low(coeff_ptr);
@@ -107,17 +105,14 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
*eob_ptr = accumulate_eob(eob);
}
-void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *zbin_ptr,
- const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr,
+void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr,
+ const struct macroblock_plane *const mb_plane,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
+ const struct ScanOrder *const scan_order) {
const __m128i zero = _mm_setzero_si128();
- const __m128i one = _mm_set1_epi16(1);
int index;
+ const int16_t *iscan = scan_order->iscan;
__m128i zbin, round, quant, dequant, shift;
__m128i coeff0, coeff1;
@@ -126,30 +121,8 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
__m128i all_zero;
__m128i eob = zero, eob0;
- (void)scan;
- (void)n_coeffs;
-
- // Setup global values.
- // The 32x32 halves zbin and round.
- zbin = _mm_load_si128((const __m128i *)zbin_ptr);
- // Shift with rounding.
- zbin = _mm_add_epi16(zbin, one);
- zbin = _mm_srli_epi16(zbin, 1);
- // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
- // it is a strict "greater" comparison.
- zbin = _mm_sub_epi16(zbin, one);
-
- round = _mm_load_si128((const __m128i *)round_ptr);
- round = _mm_add_epi16(round, one);
- round = _mm_srli_epi16(round, 1);
-
- quant = _mm_load_si128((const __m128i *)quant_ptr);
- dequant = _mm_load_si128((const __m128i *)dequant_ptr);
- shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
- // I suspect this is not technically OK because quant_shift can be up
- // to 1 << 16 and shifting up again will outrange that, but the test is not
- // comprehensive enough to catch that and "it's been that way forever"
- shift = _mm_slli_epi16(shift, 1);
+ load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant,
+ &shift);
// Do DC and first 15 AC.
coeff0 = load_tran_low(coeff_ptr);
diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c
index 399b67b3f..cf7111983 100644
--- a/vpx_dsp/x86/sad4d_avx2.c
+++ b/vpx_dsp/x86/sad4d_avx2.c
@@ -25,9 +25,10 @@ static INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
_mm_storeu_si128((__m128i *)sad_array, sum);
}
-void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *const ref_array[4], int ref_stride,
- uint32_t sad_array[4]) {
+static INLINE void sad32xhx4d_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4],
+ int ref_stride, int h,
+ uint32_t sad_array[4]) {
int i;
const uint8_t *refs[4];
__m256i sums[4];
@@ -41,7 +42,7 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
sums[2] = _mm256_setzero_si256();
sums[3] = _mm256_setzero_si256();
- for (i = 0; i < 32; i++) {
+ for (i = 0; i < h; i++) {
__m256i r[4];
// load src and all ref[]
@@ -73,9 +74,10 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
calc_final_4(sums, sad_array);
}
-void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *const ref_array[4], int ref_stride,
- uint32_t sad_array[4]) {
+static INLINE void sad64xhx4d_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4],
+ int ref_stride, int h,
+ uint32_t sad_array[4]) {
__m256i sums[4];
int i;
const uint8_t *refs[4];
@@ -89,7 +91,7 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
sums[2] = _mm256_setzero_si256();
sums[3] = _mm256_setzero_si256();
- for (i = 0; i < 64; i++) {
+ for (i = 0; i < h; i++) {
__m256i r_lo[4], r_hi[4];
// load 64 bytes from src and all ref[]
const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr);
@@ -132,3 +134,51 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
calc_final_4(sums, sad_array);
}
+
+#define SAD64_H(h) \
+ void vpx_sad64x##h##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ sad64xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \
+ }
+
+#define SAD32_H(h) \
+ void vpx_sad32x##h##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ sad32xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \
+ }
+
+SAD64_H(64)
+SAD32_H(32)
+
+#define SADS64_H(h) \
+ void vpx_sad_skip_64x##h##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ sad64xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \
+ ((h) >> 1), sad_array); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
+
+#define SADS32_H(h) \
+ void vpx_sad_skip_32x##h##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ sad32xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \
+ ((h) >> 1), sad_array); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
+
+SADS64_H(64)
+SADS64_H(32)
+
+SADS32_H(64)
+SADS32_H(32)
+SADS32_H(16)
diff --git a/vpx_dsp/x86/sad4d_sse2.asm b/vpx_dsp/x86/sad4d_sse2.asm
index 3f6e55ce9..ed4ea3ef9 100644
--- a/vpx_dsp/x86/sad4d_sse2.asm
+++ b/vpx_dsp/x86/sad4d_sse2.asm
@@ -179,7 +179,16 @@ SECTION .text
; uint8_t *ref[4], int ref_stride,
; uint32_t res[4]);
; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4
-%macro SADNXN4D 2
+%macro SADNXN4D 2-3 0
+%if %3 == 1 ; skip rows
+%if UNIX64
+cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+ res, ref2, ref3, ref4
+%else
+cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+ ref2, ref3, ref4
+%endif
+%else ; normal sad
%if UNIX64
cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
res, ref2, ref3, ref4
@@ -187,6 +196,11 @@ cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
ref2, ref3, ref4
%endif
+%endif
+%if %3 == 1
+ lea src_strided, [2*src_strided]
+ lea ref_strided, [2*ref_strided]
+%endif
movsxdifnidn src_strideq, src_strided
movsxdifnidn ref_strideq, ref_strided
mov ref2q, [ref1q+gprsize*1]
@@ -195,9 +209,15 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
mov ref1q, [ref1q+gprsize*0]
PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
-%rep (%2-4)/2
+%if %3 == 1 ; downsample number of rows by 2
+%define num_rep (%2-8)/4
+%else
+%define num_rep (%2-4)/2
+%endif
+%rep num_rep
PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
+%undef num_rep
PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
%if %1 > 4
@@ -211,12 +231,19 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
punpckhqdq m5, m7
movifnidn r4, r4mp
paddd m4, m5
+%if %3 == 1
+ pslld m4, 1
+%endif
movu [r4], m4
RET
%else
movifnidn r4, r4mp
pshufd m6, m6, 0x08
pshufd m7, m7, 0x08
+%if %3 == 1
+ pslld m6, 1
+ pslld m7, 1
+%endif
movq [r4+0], m6
movq [r4+8], m7
RET
@@ -237,3 +264,15 @@ SADNXN4D 8, 8
SADNXN4D 8, 4
SADNXN4D 4, 8
SADNXN4D 4, 4
+
+SADNXN4D 64, 64, 1
+SADNXN4D 64, 32, 1
+SADNXN4D 32, 64, 1
+SADNXN4D 32, 32, 1
+SADNXN4D 32, 16, 1
+SADNXN4D 16, 32, 1
+SADNXN4D 16, 16, 1
+SADNXN4D 16, 8, 1
+SADNXN4D 8, 16, 1
+SADNXN4D 8, 8, 1
+SADNXN4D 4, 8, 1
diff --git a/vpx_dsp/x86/sad_avx2.c b/vpx_dsp/x86/sad_avx2.c
index 29bedb0e6..e00494d76 100644
--- a/vpx_dsp/x86/sad_avx2.c
+++ b/vpx_dsp/x86/sad_avx2.c
@@ -11,73 +11,104 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
+static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ int i, res;
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
+ __m256i sum_sad = _mm256_setzero_si256();
+ __m256i sum_sad_h;
+ __m128i sum_sad128;
+ for (i = 0; i < h; i++) {
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
+ sad1_reg =
+ _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
+ sad2_reg = _mm256_sad_epu8(
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
+ sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
+ ref_ptr += ref_stride;
+ src_ptr += src_stride;
+ }
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8);
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
+ res = _mm_cvtsi128_si32(sum_sad128);
+ return res;
+}
+
+static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ int i, res;
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
+ __m256i sum_sad = _mm256_setzero_si256();
+ __m256i sum_sad_h;
+ __m128i sum_sad128;
+ const int ref2_stride = ref_stride << 1;
+ const int src2_stride = src_stride << 1;
+ const int max = h >> 1;
+ for (i = 0; i < max; i++) {
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
+ sad1_reg =
+ _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
+ sad2_reg = _mm256_sad_epu8(
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
+ sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
+ ref_ptr += ref2_stride;
+ src_ptr += src2_stride;
+ }
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8);
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
+ res = _mm_cvtsi128_si32(sum_sad128);
+ return res;
+}
+
#define FSAD64_H(h) \
unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \
const uint8_t *ref_ptr, int ref_stride) { \
- int i; \
- __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
- __m256i sum_sad = _mm256_setzero_si256(); \
- __m256i sum_sad_h; \
- __m128i sum_sad128; \
- for (i = 0; i < h; i++) { \
- ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
- ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
- sad1_reg = _mm256_sad_epu8( \
- ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
- sad2_reg = _mm256_sad_epu8( \
- ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
- sum_sad = \
- _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
- ref_ptr += ref_stride; \
- src_ptr += src_stride; \
- } \
- sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
- sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
- sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
- sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
- return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \
+ return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \
+ }
+
+#define FSADS64_H(h) \
+ unsigned int vpx_sad_skip_64x##h##_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+ h / 2); \
}
#define FSAD32_H(h) \
unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \
const uint8_t *ref_ptr, int ref_stride) { \
- int i, res; \
- __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
- __m256i sum_sad = _mm256_setzero_si256(); \
- __m256i sum_sad_h; \
- __m128i sum_sad128; \
- int ref2_stride = ref_stride << 1; \
- int src2_stride = src_stride << 1; \
- int max = h >> 1; \
- for (i = 0; i < max; i++) { \
- ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
- ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
- sad1_reg = _mm256_sad_epu8( \
- ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
- sad2_reg = _mm256_sad_epu8( \
- ref2_reg, \
- _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
- sum_sad = \
- _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
- ref_ptr += ref2_stride; \
- src_ptr += src2_stride; \
- } \
- sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
- sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
- sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
- sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
- res = _mm_cvtsi128_si32(sum_sad128); \
- return res; \
+ return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \
+ }
+
+#define FSADS32_H(h) \
+ unsigned int vpx_sad_skip_32x##h##_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+ h / 2); \
}
-#define FSAD64 \
- FSAD64_H(64) \
- FSAD64_H(32)
+#define FSAD64 \
+ FSAD64_H(64) \
+ FSAD64_H(32) \
+ FSADS64_H(64) \
+ FSADS64_H(32)
-#define FSAD32 \
- FSAD32_H(64) \
- FSAD32_H(32) \
- FSAD32_H(16)
+#define FSAD32 \
+ FSAD32_H(64) \
+ FSAD32_H(32) \
+ FSAD32_H(16) \
+ FSADS32_H(64) \
+ FSADS32_H(32) \
+ FSADS32_H(16)
FSAD64
FSAD32
@@ -86,6 +117,8 @@ FSAD32
#undef FSAD32
#undef FSAD64_H
#undef FSAD32_H
+#undef FSADS64_H
+#undef FSADS32_H
#define FSADAVG64_H(h) \
unsigned int vpx_sad64x##h##_avg_avx2( \
diff --git a/vpx_dsp/x86/sad_sse2.asm b/vpx_dsp/x86/sad_sse2.asm
index e4e1bc3e9..627e463bf 100644
--- a/vpx_dsp/x86/sad_sse2.asm
+++ b/vpx_dsp/x86/sad_sse2.asm
@@ -12,15 +12,29 @@
SECTION .text
+; Macro Arguments
+; Arg 1: Width
+; Arg 2: Height
+; Arg 3: Number of general purpose registers
+; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
%macro SAD_FN 4
-%if %4 == 0
+%if %4 == 0 ; normal sad
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
-%else ; avg
+
+%elif %4 == 2 ; skip
+%if %3 == 5
+cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+
+%else
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
second_pred, n_rows
@@ -35,7 +49,11 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
-%endif ; avg/sad
+%endif ; sad/avg/skip
+%if %4 == 2; skip rows so double the stride
+lea src_strided, [src_strided*2]
+lea ref_strided, [ref_strided*2]
+%endif ; %4 skip
movsxdifnidn src_strideq, src_strided
movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
@@ -48,7 +66,11 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \
; uint8_t *ref, int ref_stride);
%macro SAD64XN 1-2 0
SAD_FN 64, %1, 5, %2
+%if %2 == 2
+ mov n_rowsd, %1/2
+%else
mov n_rowsd, %1
+%endif
pxor m0, m0
.loop:
movu m1, [refq]
@@ -77,6 +99,9 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \
movhlps m1, m0
paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
movd eax, m0
RET
%endmacro
@@ -86,12 +111,18 @@ SAD64XN 64 ; sad64x64_sse2
SAD64XN 32 ; sad64x32_sse2
SAD64XN 64, 1 ; sad64x64_avg_sse2
SAD64XN 32, 1 ; sad64x32_avg_sse2
+SAD64XN 64, 2 ; sad64x64_skip_sse2
+SAD64XN 32, 2 ; sad64x32_skip_sse2
; unsigned int vpx_sad32x32_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD32XN 1-2 0
SAD_FN 32, %1, 5, %2
+%if %2 == 2
+ mov n_rowsd, %1/4
+%else
mov n_rowsd, %1/2
+%endif
pxor m0, m0
.loop:
movu m1, [refq]
@@ -120,6 +151,9 @@ SAD64XN 32, 1 ; sad64x32_avg_sse2
movhlps m1, m0
paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
movd eax, m0
RET
%endmacro
@@ -131,12 +165,19 @@ SAD32XN 16 ; sad32x16_sse2
SAD32XN 64, 1 ; sad32x64_avg_sse2
SAD32XN 32, 1 ; sad32x32_avg_sse2
SAD32XN 16, 1 ; sad32x16_avg_sse2
+SAD32XN 64, 2 ; sad32x64_skip_sse2
+SAD32XN 32, 2 ; sad32x32_skip_sse2
+SAD32XN 16, 2 ; sad32x16_skip_sse2
; unsigned int vpx_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD16XN 1-2 0
SAD_FN 16, %1, 7, %2
+%if %2 == 2
+ mov n_rowsd, %1/8
+%else
mov n_rowsd, %1/4
+%endif
pxor m0, m0
.loop:
@@ -166,6 +207,9 @@ SAD32XN 16, 1 ; sad32x16_avg_sse2
movhlps m1, m0
paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
movd eax, m0
RET
%endmacro
@@ -177,12 +221,19 @@ SAD16XN 8 ; sad16x8_sse2
SAD16XN 32, 1 ; sad16x32_avg_sse2
SAD16XN 16, 1 ; sad16x16_avg_sse2
SAD16XN 8, 1 ; sad16x8_avg_sse2
+SAD16XN 32, 2 ; sad16x32_skip_sse2
+SAD16XN 16, 2 ; sad16x16_skip_sse2
+SAD16XN 8, 2 ; sad16x8_skip_sse2
; unsigned int vpx_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD8XN 1-2 0
SAD_FN 8, %1, 7, %2
+%if %2 == 2
+ mov n_rowsd, %1/8
+%else
mov n_rowsd, %1/4
+%endif
pxor m0, m0
.loop:
@@ -210,6 +261,9 @@ SAD16XN 8, 1 ; sad16x8_avg_sse2
movhlps m1, m0
paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
movd eax, m0
RET
%endmacro
@@ -221,12 +275,18 @@ SAD8XN 4 ; sad8x4_sse2
SAD8XN 16, 1 ; sad8x16_avg_sse2
SAD8XN 8, 1 ; sad8x8_avg_sse2
SAD8XN 4, 1 ; sad8x4_avg_sse2
+SAD8XN 16, 2 ; sad8x16_skip_sse2
+SAD8XN 8, 2 ; sad8x8_skip_sse2
; unsigned int vpx_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD4XN 1-2 0
SAD_FN 4, %1, 7, %2
+%if %2 == 2
+ mov n_rowsd, %1/8
+%else
mov n_rowsd, %1/4
+%endif
pxor m0, m0
.loop:
@@ -257,6 +317,9 @@ SAD8XN 4, 1 ; sad8x4_avg_sse2
movhlps m1, m0
paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
movd eax, m0
RET
%endmacro
@@ -266,3 +329,4 @@ SAD4XN 8 ; sad4x8_sse
SAD4XN 4 ; sad4x4_sse
SAD4XN 8, 1 ; sad4x8_avg_sse
SAD4XN 4, 1 ; sad4x4_avg_sse
+SAD4XN 8, 2 ; sad4x8_skip_sse
diff --git a/vpx_dsp/x86/sse_avx2.c b/vpx_dsp/x86/sse_avx2.c
new file mode 100644
index 000000000..917ff0ef1
--- /dev/null
+++ b/vpx_dsp/x86/sse_avx2.c
@@ -0,0 +1,367 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h>
+#include <immintrin.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+
+static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a,
+ const uint8_t *b) {
+ const __m256i v_a0 = _mm256_loadu_si256((const __m256i *)a);
+ const __m256i v_b0 = _mm256_loadu_si256((const __m256i *)b);
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, zero);
+ const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero);
+ const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero);
+ const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero);
+ const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w);
+ const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w));
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w));
+}
+
+static INLINE int64_t summary_all_avx2(const __m256i *sum_all) {
+ int64_t sum;
+ __m256i zero = _mm256_setzero_si256();
+ const __m256i sum0_4x64 = _mm256_unpacklo_epi32(*sum_all, zero);
+ const __m256i sum1_4x64 = _mm256_unpackhi_epi32(*sum_all, zero);
+ const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
+ const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
+ _mm256_extracti128_si256(sum_4x64, 1));
+ const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+ _mm_storel_epi64((__m128i *)&sum, sum_1x64);
+ return sum;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void summary_32_avx2(const __m256i *sum32, __m256i *sum) {
+ const __m256i sum0_4x64 =
+ _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum32));
+ const __m256i sum1_4x64 =
+ _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum32, 1));
+ const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
+ *sum = _mm256_add_epi64(*sum, sum_4x64);
+}
+
+static INLINE int64_t summary_4x64_avx2(const __m256i sum_4x64) {
+ int64_t sum;
+ const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
+ _mm256_extracti128_si256(sum_4x64, 1));
+ const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+
+ _mm_storel_epi64((__m128i *)&sum, sum_1x64);
+ return sum;
+}
+#endif
+
+static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, __m256i *sum) {
+ const __m128i v_a0 = load_unaligned_u32(a);
+ const __m128i v_a1 = load_unaligned_u32(a + a_stride);
+ const __m128i v_a2 = load_unaligned_u32(a + a_stride * 2);
+ const __m128i v_a3 = load_unaligned_u32(a + a_stride * 3);
+ const __m128i v_b0 = load_unaligned_u32(b);
+ const __m128i v_b1 = load_unaligned_u32(b + b_stride);
+ const __m128i v_b2 = load_unaligned_u32(b + b_stride * 2);
+ const __m128i v_b3 = load_unaligned_u32(b + b_stride * 3);
+ const __m128i v_a0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_a0, v_a1),
+ _mm_unpacklo_epi32(v_a2, v_a3));
+ const __m128i v_b0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_b0, v_b1),
+ _mm_unpacklo_epi32(v_b2, v_b3));
+ const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123);
+ const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123);
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, __m256i *sum) {
+ const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a);
+ const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride));
+ const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b);
+ const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride));
+ const __m256i v_a_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1));
+ const __m256i v_b_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1));
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+int64_t vpx_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int32_t y = 0;
+ int64_t sse = 0;
+ __m256i sum = _mm256_setzero_si256();
+ __m256i zero = _mm256_setzero_si256();
+ switch (width) {
+ case 4:
+ do {
+ sse_w4x4_avx2(a, a_stride, b, b_stride, &sum);
+ a += a_stride << 2;
+ b += b_stride << 2;
+ y += 4;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 8:
+ do {
+ sse_w8x2_avx2(a, a_stride, b, b_stride, &sum);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 16:
+ do {
+ const __m128i v_a0 = _mm_loadu_si128((const __m128i *)a);
+ const __m128i v_a1 = _mm_loadu_si128((const __m128i *)(a + a_stride));
+ const __m128i v_b0 = _mm_loadu_si128((const __m128i *)b);
+ const __m128i v_b1 = _mm_loadu_si128((const __m128i *)(b + b_stride));
+ const __m256i v_a =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01);
+ const __m256i v_b =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01);
+ const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero);
+ const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero);
+ const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero);
+ const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero);
+ const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl);
+ const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu);
+ const __m256i temp =
+ _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub),
+ _mm256_madd_epi16(v_bsub, v_bsub));
+ sum = _mm256_add_epi32(sum, temp);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 32:
+ do {
+ sse_w32_avx2(&sum, a, b);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 64:
+ do {
+ sse_w32_avx2(&sum, a, b);
+ sse_w32_avx2(&sum, a + 32, b + 32);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ default:
+ if ((width & 0x07) == 0) {
+ do {
+ int i = 0;
+ do {
+ sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum);
+ i += 8;
+ } while (i < width);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ } else {
+ do {
+ int i = 0;
+ do {
+ const uint8_t *a2;
+ const uint8_t *b2;
+ sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum);
+ a2 = a + i + (a_stride << 1);
+ b2 = b + i + (b_stride << 1);
+ sse_w8x2_avx2(a2, a_stride, b2, b_stride, &sum);
+ i += 8;
+ } while (i + 4 < width);
+ sse_w4x4_avx2(a + i, a_stride, b + i, b_stride, &sum);
+ a += a_stride << 2;
+ b += b_stride << 2;
+ y += 4;
+ } while (y < height);
+ }
+ sse = summary_all_avx2(&sum);
+ break;
+ }
+
+ return sse;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a,
+ const uint16_t *b) {
+ const __m256i v_a_w = _mm256_loadu_si256((const __m256i *)a);
+ const __m256i v_b_w = _mm256_loadu_si256((const __m256i *)b);
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+static INLINE void highbd_sse_w4x4_avx2(__m256i *sum, const uint16_t *a,
+ int a_stride, const uint16_t *b,
+ int b_stride) {
+ const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a);
+ const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride));
+ const __m128i v_a2 = _mm_loadl_epi64((const __m128i *)(a + a_stride * 2));
+ const __m128i v_a3 = _mm_loadl_epi64((const __m128i *)(a + a_stride * 3));
+ const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b);
+ const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride));
+ const __m128i v_b2 = _mm_loadl_epi64((const __m128i *)(b + b_stride * 2));
+ const __m128i v_b3 = _mm_loadl_epi64((const __m128i *)(b + b_stride * 3));
+ const __m128i v_a_hi = _mm_unpacklo_epi64(v_a0, v_a1);
+ const __m128i v_a_lo = _mm_unpacklo_epi64(v_a2, v_a3);
+ const __m256i v_a_w =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(v_a_lo), v_a_hi, 1);
+ const __m128i v_b_hi = _mm_unpacklo_epi64(v_b0, v_b1);
+ const __m128i v_b_lo = _mm_unpacklo_epi64(v_b2, v_b3);
+ const __m256i v_b_w =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(v_b_lo), v_b_hi, 1);
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+static INLINE void highbd_sse_w8x2_avx2(__m256i *sum, const uint16_t *a,
+ int a_stride, const uint16_t *b,
+ int b_stride) {
+ const __m128i v_a_hi = _mm_loadu_si128((const __m128i *)(a + a_stride));
+ const __m128i v_a_lo = _mm_loadu_si128((const __m128i *)a);
+ const __m256i v_a_w =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(v_a_lo), v_a_hi, 1);
+ const __m128i v_b_hi = _mm_loadu_si128((const __m128i *)(b + b_stride));
+ const __m128i v_b_lo = _mm_loadu_si128((const __m128i *)b);
+ const __m256i v_b_w =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(v_b_lo), v_b_hi, 1);
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+int64_t vpx_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8,
+ int b_stride, int width, int height) {
+ int32_t y = 0;
+ int64_t sse = 0;
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ __m256i sum = _mm256_setzero_si256();
+ switch (width) {
+ case 4:
+ do {
+ highbd_sse_w4x4_avx2(&sum, a, a_stride, b, b_stride);
+ a += a_stride << 2;
+ b += b_stride << 2;
+ y += 4;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 8:
+ do {
+ highbd_sse_w8x2_avx2(&sum, a, a_stride, b, b_stride);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 16:
+ do {
+ highbd_sse_w16_avx2(&sum, a, b);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 32:
+ do {
+ int l = 0;
+ __m256i sum32 = _mm256_setzero_si256();
+ do {
+ highbd_sse_w16_avx2(&sum32, a, b);
+ highbd_sse_w16_avx2(&sum32, a + 16, b + 16);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 64 && l < (height - y));
+ summary_32_avx2(&sum32, &sum);
+ y += 64;
+ } while (y < height);
+ sse = summary_4x64_avx2(sum);
+ break;
+ case 64:
+ do {
+ int l = 0;
+ __m256i sum32 = _mm256_setzero_si256();
+ do {
+ highbd_sse_w16_avx2(&sum32, a, b);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 32 && l < (height - y));
+ summary_32_avx2(&sum32, &sum);
+ y += 32;
+ } while (y < height);
+ sse = summary_4x64_avx2(sum);
+ break;
+ default:
+ if (width & 0x7) {
+ do {
+ int i = 0;
+ __m256i sum32 = _mm256_setzero_si256();
+ do {
+ const uint16_t *a2;
+ const uint16_t *b2;
+ highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride);
+ a2 = a + i + (a_stride << 1);
+ b2 = b + i + (b_stride << 1);
+ highbd_sse_w8x2_avx2(&sum32, a2, a_stride, b2, b_stride);
+ i += 8;
+ } while (i + 4 < width);
+ highbd_sse_w4x4_avx2(&sum32, a + i, a_stride, b + i, b_stride);
+ summary_32_avx2(&sum32, &sum);
+ a += a_stride << 2;
+ b += b_stride << 2;
+ y += 4;
+ } while (y < height);
+ } else {
+ do {
+ int l = 0;
+ __m256i sum32 = _mm256_setzero_si256();
+ do {
+ int i = 0;
+ do {
+ highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride);
+ i += 8;
+ } while (i < width);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ l += 2;
+ } while (l < 8 && l < (height - y));
+ summary_32_avx2(&sum32, &sum);
+ y += 8;
+ } while (y < height);
+ }
+ sse = summary_4x64_avx2(sum);
+ break;
+ }
+ return sse;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/x86/sse_sse4.c b/vpx_dsp/x86/sse_sse4.c
new file mode 100644
index 000000000..4a7585c57
--- /dev/null
+++ b/vpx_dsp/x86/sse_sse4.c
@@ -0,0 +1,312 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+
+static INLINE int64_t summary_all_sse4(const __m128i *sum_all) {
+ int64_t sum;
+ const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all);
+ const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8));
+ const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1);
+ const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+ _mm_storel_epi64((__m128i *)&sum, sum_1x64);
+ return sum;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void summary_32_sse4(const __m128i *sum32, __m128i *sum64) {
+ const __m128i sum0 = _mm_cvtepu32_epi64(*sum32);
+ const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum32, 8));
+ *sum64 = _mm_add_epi64(sum0, *sum64);
+ *sum64 = _mm_add_epi64(sum1, *sum64);
+}
+#endif
+
+static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a,
+ const uint8_t *b) {
+ const __m128i v_a0 = _mm_loadu_si128((const __m128i *)a);
+ const __m128i v_b0 = _mm_loadu_si128((const __m128i *)b);
+ const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0);
+ const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8));
+ const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0);
+ const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8));
+ const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w);
+ const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w);
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w));
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w));
+}
+
+static INLINE void sse4x2_sse4_1(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, __m128i *sum) {
+ const __m128i v_a0 = load_unaligned_u32(a);
+ const __m128i v_a1 = load_unaligned_u32(a + a_stride);
+ const __m128i v_b0 = load_unaligned_u32(b);
+ const __m128i v_b1 = load_unaligned_u32(b + b_stride);
+ const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1));
+ const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1));
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+static INLINE void sse8_sse4_1(const uint8_t *a, const uint8_t *b,
+ __m128i *sum) {
+ const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a);
+ const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b);
+ const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0);
+ const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0);
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+int64_t vpx_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int y = 0;
+ int64_t sse = 0;
+ __m128i sum = _mm_setzero_si128();
+ switch (width) {
+ case 4:
+ do {
+ sse4x2_sse4_1(a, a_stride, b, b_stride, &sum);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 8:
+ do {
+ sse8_sse4_1(a, b, &sum);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 16:
+ do {
+ sse_w16_sse4_1(&sum, a, b);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 32:
+ do {
+ sse_w16_sse4_1(&sum, a, b);
+ sse_w16_sse4_1(&sum, a + 16, b + 16);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 64:
+ do {
+ sse_w16_sse4_1(&sum, a, b);
+ sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
+ sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
+ sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ default:
+ if (width & 0x07) {
+ do {
+ int i = 0;
+ do {
+ sse8_sse4_1(a + i, b + i, &sum);
+ sse8_sse4_1(a + i + a_stride, b + i + b_stride, &sum);
+ i += 8;
+ } while (i + 4 < width);
+ sse4x2_sse4_1(a + i, a_stride, b + i, b_stride, &sum);
+ a += (a_stride << 1);
+ b += (b_stride << 1);
+ y += 2;
+ } while (y < height);
+ } else {
+ do {
+ int i = 0;
+ do {
+ sse8_sse4_1(a + i, b + i, &sum);
+ i += 8;
+ } while (i < width);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ }
+ sse = summary_all_sse4(&sum);
+ break;
+ }
+
+ return sse;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a,
+ int a_stride, const uint16_t *b,
+ int b_stride) {
+ const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a);
+ const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride));
+ const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b);
+ const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride));
+ const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1);
+ const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1);
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a,
+ const uint16_t *b) {
+ const __m128i v_a_w = _mm_loadu_si128((const __m128i *)a);
+ const __m128i v_b_w = _mm_loadu_si128((const __m128i *)b);
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+int64_t vpx_highbd_sse_sse4_1(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int width,
+ int height) {
+ int32_t y = 0;
+ int64_t sse = 0;
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ __m128i sum = _mm_setzero_si128();
+ switch (width) {
+ case 4:
+ do {
+ highbd_sse_w4x2_sse4_1(&sum, a, a_stride, b, b_stride);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 8:
+ do {
+ highbd_sse_w8_sse4_1(&sum, a, b);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 16:
+ do {
+ int l = 0;
+ __m128i sum32 = _mm_setzero_si128();
+ do {
+ highbd_sse_w8_sse4_1(&sum32, a, b);
+ highbd_sse_w8_sse4_1(&sum32, a + 8, b + 8);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 64 && l < (height - y));
+ summary_32_sse4(&sum32, &sum);
+ y += 64;
+ } while (y < height);
+ _mm_storel_epi64((__m128i *)&sse,
+ _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
+ break;
+ case 32:
+ do {
+ int l = 0;
+ __m128i sum32 = _mm_setzero_si128();
+ do {
+ highbd_sse_w8_sse4_1(&sum32, a, b);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 32 && l < (height - y));
+ summary_32_sse4(&sum32, &sum);
+ y += 32;
+ } while (y < height);
+ _mm_storel_epi64((__m128i *)&sse,
+ _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
+ break;
+ case 64:
+ do {
+ int l = 0;
+ __m128i sum32 = _mm_setzero_si128();
+ do {
+ highbd_sse_w8_sse4_1(&sum32, a, b);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 16 && l < (height - y));
+ summary_32_sse4(&sum32, &sum);
+ y += 16;
+ } while (y < height);
+ _mm_storel_epi64((__m128i *)&sse,
+ _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
+ break;
+ default:
+ if (width & 0x7) {
+ do {
+ __m128i sum32 = _mm_setzero_si128();
+ int i = 0;
+ do {
+ highbd_sse_w8_sse4_1(&sum32, a + i, b + i);
+ highbd_sse_w8_sse4_1(&sum32, a + i + a_stride, b + i + b_stride);
+ i += 8;
+ } while (i + 4 < width);
+ highbd_sse_w4x2_sse4_1(&sum32, a + i, a_stride, b + i, b_stride);
+ a += (a_stride << 1);
+ b += (b_stride << 1);
+ y += 2;
+ summary_32_sse4(&sum32, &sum);
+ } while (y < height);
+ } else {
+ do {
+ int l = 0;
+ __m128i sum32 = _mm_setzero_si128();
+ do {
+ int i = 0;
+ do {
+ highbd_sse_w8_sse4_1(&sum32, a + i, b + i);
+ i += 8;
+ } while (i < width);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 8 && l < (height - y));
+ summary_32_sse4(&sum32, &sum);
+ y += 8;
+ } while (y < height);
+ }
+ _mm_storel_epi64((__m128i *)&sse,
+ _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
+ break;
+ }
+ return sse;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/x86/subtract_sse2.asm b/vpx_dsp/x86/subtract_sse2.asm
index 4273efb85..e3055ab29 100644
--- a/vpx_dsp/x86/subtract_sse2.asm
+++ b/vpx_dsp/x86/subtract_sse2.asm
@@ -124,4 +124,5 @@ INIT_MMX
lea predq, [predq+pred_str*2]
sub rowsd, 2
jg .loop_4
+ emms
RET
diff --git a/vpx_dsp/x86/variance_avx2.c b/vpx_dsp/x86/variance_avx2.c
index 35925d590..8305b9f20 100644
--- a/vpx_dsp/x86/variance_avx2.c
+++ b/vpx_dsp/x86/variance_avx2.c
@@ -98,6 +98,41 @@ static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) {
return _mm256_add_epi32(sum_lo, sum_hi);
}
+static INLINE void variance8_kernel_avx2(
+ const uint8_t *const src, const int src_stride, const uint8_t *const ref,
+ const int ref_stride, __m256i *const sse, __m256i *const sum) {
+ __m128i src0, src1, ref0, ref1;
+ __m256i ss, rr, diff;
+
+ // 0 0 0.... 0 s07 s06 s05 s04 s03 s02 s01 s00
+ src0 = _mm_loadl_epi64((const __m128i *)(src + 0 * src_stride));
+
+ // 0 0 0.... 0 s17 s16 s15 s14 s13 s12 s11 s10
+ src1 = _mm_loadl_epi64((const __m128i *)(src + 1 * src_stride));
+
+ // s17 s16...s11 s10 s07 s06...s01 s00 (8bit)
+ src0 = _mm_unpacklo_epi64(src0, src1);
+
+ // s17 s16...s11 s10 s07 s06...s01 s00 (16 bit)
+ ss = _mm256_cvtepu8_epi16(src0);
+
+ // 0 0 0.... 0 r07 r06 r05 r04 r03 r02 r01 r00
+ ref0 = _mm_loadl_epi64((const __m128i *)(ref + 0 * ref_stride));
+
+ // 0 0 0.... 0 r17 r16 0 r15 0 r14 0 r13 0 r12 0 r11 0 r10
+ ref1 = _mm_loadl_epi64((const __m128i *)(ref + 1 * ref_stride));
+
+ // r17 r16...r11 r10 r07 r06...r01 r00 (8 bit)
+ ref0 = _mm_unpacklo_epi64(ref0, ref1);
+
+ // r17 r16...r11 r10 r07 r06...r01 r00 (16 bit)
+ rr = _mm256_cvtepu8_epi16(ref0);
+
+ diff = _mm256_sub_epi16(ss, rr);
+ *sse = _mm256_add_epi32(*sse, _mm256_madd_epi16(diff, diff));
+ *sum = _mm256_add_epi16(*sum, diff);
+}
+
static INLINE void variance16_kernel_avx2(
const uint8_t *const src, const int src_stride, const uint8_t *const ref,
const int ref_stride, __m256i *const sse, __m256i *const sum) {
@@ -119,6 +154,21 @@ static INLINE void variance32_kernel_avx2(const uint8_t *const src,
variance_kernel_avx2(s, r, sse, sum);
}
+static INLINE void variance8_avx2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m256i *const vsse,
+ __m256i *const vsum) {
+ int i;
+ *vsum = _mm256_setzero_si256();
+ *vsse = _mm256_setzero_si256();
+
+ for (i = 0; i < h; i += 2) {
+ variance8_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum);
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ }
+}
+
static INLINE void variance16_avx2(const uint8_t *src, const int src_stride,
const uint8_t *ref, const int ref_stride,
const int h, __m256i *const vsse,
@@ -612,6 +662,36 @@ typedef void (*get_var_avx2)(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
unsigned int *sse, int *sum);
+unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m256i vsse, vsum;
+ int sum;
+ variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum);
+ variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+ return *sse - ((sum * sum) >> 5);
+}
+
+unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m256i vsse, vsum;
+ int sum;
+ variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
+ variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+ return *sse - ((sum * sum) >> 6);
+}
+
+unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse) {
+ __m256i vsse, vsum;
+ int sum;
+ variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+ variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+ return *sse - ((sum * sum) >> 7);
+}
+
unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
unsigned int *sse) {
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index c7d880860..526c28382 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -15,6 +15,7 @@
#include "vpx_dsp/x86/convolve.h"
#include "vpx_dsp/x86/convolve_avx2.h"
#include "vpx_dsp/x86/convolve_sse2.h"
+#include "vpx_dsp/x86/convolve_ssse3.h"
#include "vpx_ports/mem.h"
// filters for 16_h8
@@ -38,6 +39,27 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};
+DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[64]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2,
+ 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9,
+ 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+};
+
+#define CALC_CONVOLVE8_HORZ_ROW \
+ srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch); \
+ s1[0] = _mm256_shuffle_epi8(srcReg, filt[0]); \
+ s1[1] = _mm256_shuffle_epi8(srcReg, filt[1]); \
+ s1[2] = _mm256_shuffle_epi8(srcReg, filt[2]); \
+ s1[3] = _mm256_shuffle_epi8(srcReg, filt[3]); \
+ s1[0] = convolve8_16_avx2(s1, f1); \
+ s1[0] = _mm256_packus_epi16(s1[0], s1[0]); \
+ src_ptr += src_stride; \
+ _mm_storel_epi64((__m128i *)&output_ptr[0], _mm256_castsi256_si128(s1[0])); \
+ output_ptr += output_pitch; \
+ _mm_storel_epi64((__m128i *)&output_ptr[0], \
+ _mm256_extractf128_si256(s1[0], 1)); \
+ output_ptr += output_pitch;
+
static INLINE void vpx_filter_block1d16_h8_x_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter,
@@ -61,12 +83,7 @@ static INLINE void vpx_filter_block1d16_h8_x_avx2(
__m256i srcReg;
// load the 2 strides of source
- srcReg =
- _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3)));
- srcReg = _mm256_inserti128_si256(
- srcReg,
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)),
- 1);
+ srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr + src_pixels_per_line - 3);
// filter the source buffer
s[0] = _mm256_shuffle_epi8(srcReg, filt[0]);
@@ -77,12 +94,7 @@ static INLINE void vpx_filter_block1d16_h8_x_avx2(
// reading 2 strides of the next 16 bytes
// (part of it was being read by earlier read)
- srcReg =
- _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5)));
- srcReg = _mm256_inserti128_si256(
- srcReg,
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)),
- 1);
+ srcReg = mm256_loadu2_si128(src_ptr + 5, src_ptr + src_pixels_per_line + 5);
// filter the source buffer
s[0] = _mm256_shuffle_epi8(srcReg, filt[0]);
@@ -97,60 +109,37 @@ static INLINE void vpx_filter_block1d16_h8_x_avx2(
src_ptr += src_stride;
- // average if necessary
- outReg1 = _mm256_castsi256_si128(outReg32b1);
- outReg2 = _mm256_extractf128_si256(outReg32b1, 1);
if (avg) {
- outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
- outReg2 = _mm_avg_epu8(
- outReg2, _mm_load_si128((__m128i *)(output_ptr + output_pitch)));
+ const __m256i outReg = mm256_loadu2_si128(
+ (__m128i *)output_ptr, (__m128i *)(output_ptr + output_pitch));
+ outReg32b1 = _mm256_avg_epu8(outReg32b1, outReg);
}
-
- // save 16 bytes
- _mm_store_si128((__m128i *)output_ptr, outReg1);
-
- // save the next 16 bits
- _mm_store_si128((__m128i *)(output_ptr + output_pitch), outReg2);
-
+ mm256_store2_si128((__m128i *)output_ptr,
+ (__m128i *)(output_ptr + output_pitch), &outReg32b1);
output_ptr += dst_stride;
}
// if the number of strides is odd.
// process only 16 bytes
if (i > 0) {
- __m128i srcReg;
-
- // load the first 16 bytes of the last row
- srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+ const __m128i srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+ const __m128i srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
+ const __m256i srcReg =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(srcReg1), srcReg2, 1);
// filter the source buffer
- s[0] = _mm256_castsi128_si256(
- _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0])));
- s[1] = _mm256_castsi128_si256(
- _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1])));
- s[2] = _mm256_castsi128_si256(
- _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2])));
- s[3] = _mm256_castsi128_si256(
- _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3])));
- outReg1 = convolve8_8_avx2(s, f);
-
- // reading the next 16 bytes
- // (part of it was being read by earlier read)
- srcReg = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
+ s[0] = _mm256_shuffle_epi8(srcReg, filt[0]);
+ s[1] = _mm256_shuffle_epi8(srcReg, filt[1]);
+ s[2] = _mm256_shuffle_epi8(srcReg, filt[2]);
+ s[3] = _mm256_shuffle_epi8(srcReg, filt[3]);
- // filter the source buffer
- s[0] = _mm256_castsi128_si256(
- _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0])));
- s[1] = _mm256_castsi128_si256(
- _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1])));
- s[2] = _mm256_castsi128_si256(
- _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2])));
- s[3] = _mm256_castsi128_si256(
- _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3])));
- outReg2 = convolve8_8_avx2(s, f);
+ // The low and high 128-bits of each lane contain the first and second
+ // convolve result respectively
+ outReg32b1 = convolve8_16_avx2(s, f);
+ outReg1 = _mm256_castsi256_si128(outReg32b1);
+ outReg2 = _mm256_extractf128_si256(outReg32b1, 1);
- // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane
- // contain the first and second convolve result respectively
+ // shrink to 8 bit each 16 bits
outReg1 = _mm_packus_epi16(outReg1, outReg2);
// average if necessary
@@ -177,11 +166,63 @@ static void vpx_filter_block1d16_h8_avg_avx2(
output_height, filter, 1);
}
+static void vpx_filter_block1d8_h8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m256i filt[4], f1[4], s1[4], srcReg;
+ __m128i f[4], s[4];
+ int y = output_height;
+
+ // Multiply the size of the source stride by two
+ const ptrdiff_t src_stride = src_pitch << 1;
+
+ shuffle_filter_avx2(filter, f1);
+ filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+ filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+ filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+
+ // Process next 4 rows
+ while (y > 3) {
+ CALC_CONVOLVE8_HORZ_ROW
+ CALC_CONVOLVE8_HORZ_ROW
+ y -= 4;
+ }
+
+ // If remaining, then process 2 rows at a time
+ while (y > 1) {
+ CALC_CONVOLVE8_HORZ_ROW
+ y -= 2;
+ }
+
+ // For the remaining height.
+ if (y > 0) {
+ const __m128i src_reg_128 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+ f[0] = _mm256_castsi256_si128(f1[0]);
+ f[1] = _mm256_castsi256_si128(f1[1]);
+ f[2] = _mm256_castsi256_si128(f1[2]);
+ f[3] = _mm256_castsi256_si128(f1[3]);
+
+ // filter the source buffer
+ s[0] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[0]));
+ s[1] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[1]));
+ s[2] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[2]));
+ s[3] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[3]));
+ s[0] = convolve8_8_ssse3(s, f);
+
+ // Saturate 16bit value to 8bit.
+ s[0] = _mm_packus_epi16(s[0], s[0]);
+
+ // Save only 8 bytes
+ _mm_storel_epi64((__m128i *)&output_ptr[0], s[0]);
+ }
+}
+
static INLINE void vpx_filter_block1d16_v8_x_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter,
const int avg) {
- __m128i outReg1, outReg2;
__m256i srcRegHead1;
unsigned int i;
ptrdiff_t src_stride, dst_stride;
@@ -260,19 +301,14 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2(
src_ptr += src_stride;
// average if necessary
- outReg1 = _mm256_castsi256_si128(s1[0]);
- outReg2 = _mm256_extractf128_si256(s1[0], 1);
if (avg) {
- outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
- outReg2 = _mm_avg_epu8(
- outReg2, _mm_load_si128((__m128i *)(output_ptr + out_pitch)));
+ const __m256i outReg = mm256_loadu2_si128(
+ (__m128i *)output_ptr, (__m128i *)(output_ptr + out_pitch));
+ s1[0] = _mm256_avg_epu8(s1[0], outReg);
}
- // save 16 bytes
- _mm_store_si128((__m128i *)output_ptr, outReg1);
-
- // save the next 16 bits
- _mm_store_si128((__m128i *)(output_ptr + out_pitch), outReg2);
+ mm256_store2_si128((__m128i *)output_ptr,
+ (__m128i *)(output_ptr + out_pitch), s1);
output_ptr += dst_stride;
@@ -534,9 +570,6 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr,
const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
int h;
- __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
- __m256i dst_reg;
- __m256i tmp_0, tmp_1;
__m256i idx_shift_0 =
_mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1,
2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
@@ -557,9 +590,11 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr,
for (h = height; h >= 2; h -= 2) {
// Load the source
- src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
- src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
- src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+ const __m256i src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
+ __m256i dst_reg;
+ __m256i tmp_0, tmp_1;
+ const __m256i src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+ const __m256i src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
// Get the output
tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
@@ -580,9 +615,9 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr,
// Repeat for the last row if needed
if (h > 0) {
- __m128i src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+ const __m128i src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
__m128i dst_reg;
- const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+ const __m128i reg_32_128 = _mm_set1_epi16(32); // Used for rounding
__m128i tmp_0, tmp_1;
__m128i src_reg_shift_0 =
@@ -596,7 +631,7 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr,
_mm256_castsi256_si128(kernel_reg_45));
dst_reg = _mm_adds_epi16(tmp_0, tmp_1);
- dst_reg = mm_round_epi16_sse2(&dst_reg, &reg_32, 6);
+ dst_reg = mm_round_epi16_sse2(&dst_reg, &reg_32_128, 6);
dst_reg = _mm_packus_epi16(dst_reg, _mm_setzero_si128());
@@ -715,8 +750,6 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr,
const ptrdiff_t unrolled_src_stride = src_stride << 1;
const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
- __m256i src_reg, src_reg_shuf;
- __m256i dst;
__m256i shuf_idx =
_mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2,
3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6);
@@ -733,12 +766,12 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr,
for (h = height; h > 1; h -= 2) {
// Load the source
- src_reg = mm256_loadu2_epi64((const __m128i *)src_ptr,
- (const __m128i *)(src_ptr + src_stride));
- src_reg_shuf = _mm256_shuffle_epi8(src_reg, shuf_idx);
+ const __m256i src_reg = mm256_loadu2_epi64(
+ (const __m128i *)src_ptr, (const __m128i *)(src_ptr + src_stride));
+ const __m256i src_reg_shuf = _mm256_shuffle_epi8(src_reg, shuf_idx);
// Get the result
- dst = _mm256_maddubs_epi16(src_reg_shuf, kernel_reg);
+ __m256i dst = _mm256_maddubs_epi16(src_reg_shuf, kernel_reg);
dst = _mm256_hadds_epi16(dst, _mm256_setzero_si256());
// Round result
@@ -757,7 +790,7 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr,
if (h > 0) {
// Load the source
- const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+ const __m128i reg_32_128 = _mm_set1_epi16(32); // Used for rounding
__m128i src_reg = _mm_loadl_epi64((const __m128i *)src_ptr);
__m128i src_reg_shuf =
_mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(shuf_idx));
@@ -768,7 +801,7 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr,
dst = _mm_hadds_epi16(dst, _mm_setzero_si128());
// Round result
- dst = mm_round_epi16_sse2(&dst, &reg_32, 6);
+ dst = mm_round_epi16_sse2(&dst, &reg_32_128, 6);
// Pack to 8-bits
dst = _mm_packus_epi16(dst, _mm_setzero_si128());
@@ -866,22 +899,399 @@ static void vpx_filter_block1d4_v4_avx2(const uint8_t *src_ptr,
}
}
+static void vpx_filter_block1d8_v8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m256i f[4], ss[4];
+ __m256i r[8];
+ __m128i s[9];
+
+ unsigned int y = output_height;
+ // Multiply the size of the source stride by two
+ const ptrdiff_t src_stride = src_pitch << 1;
+
+ // The output_height is always a multiple of two.
+ assert(!(output_height & 1));
+
+ shuffle_filter_avx2(filter, f);
+ s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch));
+ s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch));
+ s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch));
+ s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch));
+ s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch));
+ s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch));
+ s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch));
+
+ // merge the result together
+ // r[0]: 0 0 0 0 0 0 0 0 r17 r16 r15 r14 r13 r12 r11 r10 | 0 0 0 0 0 0 0 0
+ // r07 r06 r05 r04 r03 r02 r01 r00
+ r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[1], 1);
+
+ // r[1]: 0 0 0 0 0 0 0 0 r27 r26 r25 r24 r23 r22 r21 r20 | 0 0 0 0 0 0 0 0
+ // r17 r16 r15 r14 r13 r12 r11 r10
+ r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[2], 1);
+
+ // r[2]: 0 0 0 0 0 0 0 0 r37 r36 r35 r34 r33 r32 r31 r30 | 0 0 0 0 0 0 0 0
+ // r27 r26 r25 r24 r23 r22 r21 r20
+ r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[3], 1);
+
+ // r[3]: 0 0 0 0 0 0 0 0 r47 r46 r45 r44 r43 r42 r41 r40 | 0 0 0 0 0 0 0 0
+ // r37 r36 r35 r34 r33 r32 r31 r30
+ r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[4], 1);
+
+ // r[4]: 0 0 0 0 0 0 0 0 r57 r56 r55 r54 r53 r52 r51 r50 | 0 0 0 0 0 0 0 0
+ // r47 r46 r45 r44 r43 r42 r41 r40
+ r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[5], 1);
+
+ // r[5]: 0 0 0 0 0 0 0 0 r67 r66 r65 r64 r63 r62 r61 r60 | 0 0 0 0 0 0 0 0
+ // r57 r56 r55 r54 r53 r52 r51 r50
+ r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[6], 1);
+
+ // Merge together
+ // ss[0]: |r27 r17|.......|r21 r11|r20 r10 || r17 r07|.....|r12 r02|r11
+ // r01|r10 r00|
+ ss[0] = _mm256_unpacklo_epi8(r[0], r[1]);
+
+ // ss[0]: |r47 r37|.......|r41 r31|r40 r30 || r37 r27|.....|r32 r22|r31
+ // r21|r30 r20|
+ ss[1] = _mm256_unpacklo_epi8(r[2], r[3]);
+
+ // ss[2]: |r67 r57|.......|r61 r51|r60 r50 || r57 r47|.....|r52 r42|r51
+ // r41|r50 r40|
+ ss[2] = _mm256_unpacklo_epi8(r[4], r[5]);
+
+ // Process 2 rows at a time
+ do {
+ s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
+ s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch));
+
+ // r[6]: 0 0 0 0 0 0 0 0 r77 r76 r75 r74 r73 r72 r71 r70 | 0 0 0 0 0 0 0
+ // 0 r67 r66 r65 r64 r63 r62 r61 r60
+ r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[7], 1);
+ // r[7]: 0 0 0 0 0 0 0 0 r87 r86 r85 r84 r83 r82 r81 r80 | 0 0 0 0 0 0 0
+ // 0 r77 r76 r75 r74 r73 r72 r71 r70
+ r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[8], 1);
+
+ // ss[3] : | r87 r77 | .......| r81 r71 | r80 r70 || r77 r67 | .....| r72
+ // r62 | r71 r61|r70 r60|
+ ss[3] = _mm256_unpacklo_epi8(r[6], r[7]);
+ ss[0] = convolve8_16_avx2(ss, f);
+ ss[0] = _mm256_packus_epi16(ss[0], ss[0]);
+ src_ptr += src_stride;
+
+ /* shift down two rows */
+ s[6] = s[8];
+ _mm_storel_epi64((__m128i *)&output_ptr[0], _mm256_castsi256_si128(ss[0]));
+ output_ptr += out_pitch;
+ _mm_storel_epi64((__m128i *)&output_ptr[0],
+ _mm256_extractf128_si256(ss[0], 1));
+ output_ptr += out_pitch;
+ ss[0] = ss[1];
+ ss[1] = ss[2];
+ ss[2] = ss[3];
+ y -= 2;
+ } while (y > 1);
+}
+
+static void vpx_filter_block1d4_h8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg64_256bit;
+ unsigned int y = output_height;
+
+ assert(output_height > 1);
+
+ addFilterReg64_256bit = _mm256_set1_epi16(32);
+
+ // f7 f6 f5 f4 f3 f2 f1 f0 (16 bit)
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ // f7 f6 f5 f4 f3 f2 f1 f0 || f7 f6 f5 f4 f3 f2 f1 f0 (8 bit each)
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+ {
+ ptrdiff_t src_stride;
+ __m256i filt1Reg, filt2Reg, firstFilters, secondFilters;
+ // have the same data in both lanes of a 256 bit register
+ // f7 f6 f5 f4 f3 f2 f1 f0 f7 f6 f5 f4 f3 f2 f1 f0 | f7 f6 f5 f4 f3 f2 f1 f0
+ // f7 f6 f5 f4 f3 f2 f1 f0 (8bit each)
+ const __m256i filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
+
+ // duplicate only the first 32 bits
+ // f3 f2 f1 f0|f3 f2 f1 f0|f3 f2 f1 f0|f3 f2 f1 f0 | f3 f2 f1 f0|f3 f2 f1
+ // f0|f3 f2 f1 f0|f3 f2 f1 f0
+ firstFilters = _mm256_shuffle_epi32(filtersReg32, 0);
+ // duplicate only the second 32 bits
+ // f7 f6 f5 f4|f7 f6 f5 f4|f7 f6 f5 f4|f7 f6 f5 f4 | f7 f6 f5 f4|f7 f6 f5
+ // f4|f7 f6 f5 f4|f7 f6 f5 f4
+ secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55);
+
+ // s6 s5 s4 s3 s5 s4 s3 s2 s4 s3 s2 s1 s3 s2 s1 s0 | s6 s5 s4 s3 s5 s4 s3
+ // s2 s4 s3 s2 s1 s3 s2 s1 s0
+ filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2);
+
+ // s10 s9 s8 s7 s9 s8 s7 s6 s8 s7 s6 s5 s7 s6 s5 s4 | s10 s9 s8 s7 s9 s8 s7
+ // s6 s8 s7 s6 s5 s7 s6 s5 s4
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32));
+
+ // multiple the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+
+ do {
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcReg32b1;
+ // load the 2 strides of source
+ // r115 r114 ...... r15 r14 r13 r12 r11 r10 | r015 r014 r013 ...... r07
+ // r06 r05 r04 r03 r02 r01 r00
+ srcReg32b1 = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch);
+
+ // filter the source buffer
+ // r16 r15 r14 r13 r15 r14 r13 r12 r14 r13 r12 r11 r13 r12 r11 r10 | r06
+ // r05 r04 r03 r05 r04 r03 r02 r04 r03 r02 r01 r03 r02 r01 r00
+ srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+
+ // multiply 4 adjacent elements with the filter and add the result
+ // ...|f3*r14+f2*r13|f1*r13+f0*r12|f3*r13+f2*r12|f1*r11+f0*r10||...
+ // |f1*r03+f0*r02|f3*r04+f2*r03|f1*r02+f0*r01|f3*r03+f2*r02|f1*r01+f0*r00
+ srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+
+ // filter the source buffer
+ // r110 r19 r18 r17|r19 r18 r17 r16|r18 r17 r16 r15|r17 r16 r15 r14||r010
+ // r09 r08 r07|r09 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+
+ // multiply 4 adjacent elements with the filter and add the result
+ // r010 r09 r08 r07|r9 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04||r010
+ // r09 r08 r07|r9 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);
+
+ srcRegFilt32b1_1 =
+ _mm256_add_epi16(srcRegFilt32b1_1, addFilterReg64_256bit);
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+ srcRegFilt32b1_1 =
+ _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ // 0 0 0 0 R13 R12 R11 R10 || 0 0 0 0 R03 R02 R01 R00 (16bit)
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
+
+ // 8zeros 0 0 0 0 R13 R12 R11 R10 || 8zeros 0 0 0 0 R03 R02 R01 R00 (8bit)
+ srcRegFilt32b1_1 =
+ _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ src_ptr += src_stride;
+ // save first row 4 values
+ *((int *)&output_ptr[0]) =
+ _mm_cvtsi128_si32(_mm256_castsi256_si128(srcRegFilt32b1_1));
+ output_ptr += output_pitch;
+
+ // save second row 4 values
+ *((int *)&output_ptr[0]) =
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(srcRegFilt32b1_1, 1));
+ output_ptr += output_pitch;
+
+ y = y - 2;
+ } while (y > 1);
+
+ // For remaining height
+ if (y > 0) {
+ __m128i srcReg1, srcRegFilt1_1, addFilterReg64;
+ __m128i srcRegFilt2;
+
+ addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+ // filter the source buffer
+ srcRegFilt1_1 =
+ _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
+ _mm256_castsi256_si128(firstFilters));
+
+ // filter the source buffer
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
+
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+ srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve result
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+ // save 4 bytes
+ *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
+ }
+ }
+}
+
+static void vpx_filter_block1d4_v8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m256i f[4], ss[4];
+ __m256i r[9], rr[2];
+ __m128i s[11];
+
+ unsigned int y = output_height;
+ // Multiply the size of the source stride by four
+ const ptrdiff_t src_stride = src_pitch << 2;
+ const ptrdiff_t out_stride = out_pitch << 2;
+
+ // The output_height is always a multiple of two.
+ assert(!(output_height & 0x01));
+
+ shuffle_filter_avx2(filter, f);
+
+ s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch));
+ s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch));
+ s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch));
+ s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch));
+ s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch));
+ s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch));
+ s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch));
+
+ r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[2], 1);
+ r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[3], 1);
+ r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[4], 1);
+ r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[5], 1);
+ r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[6], 1);
+
+ // r37.....r24..r33..r31 r30 r23 r22 r21 r20|r17....r14 r07..r05 r04 r13 r12
+ // r11 r10 r03 r02 r01 r00
+ rr[0] = _mm256_unpacklo_epi32(r[0], r[1]);
+
+ // r47.....r34..r43..r41 r40 r33 r32 r31 r30|r27....r24 r17..r15 r14 r23 r22
+ // r21 r20 r13 r12 r11 r10
+ rr[1] = _mm256_unpacklo_epi32(r[1], r[2]);
+
+ // r43 r33....r40 r30|r33 r23....r30 r20||r23 r13....r20 r10|r13 r03....r10
+ // r00|
+ ss[0] = _mm256_unpacklo_epi8(rr[0], rr[1]);
+
+ // r37.....r24..r33..r31 r30 r23 r22 r21 r20||r17....r14 r07..r05 r04 r13 r12
+ // r11 r10 r03 r02 r01 r00
+ rr[0] = _mm256_unpacklo_epi32(r[2], r[3]);
+
+ // r47.....r34..r43..r41 r40 r33 r32 r31 r30|r27....r24 r17..r15 r14 r23 r22
+ // r21 r20 r13 r12 r11 r10
+ rr[1] = _mm256_unpacklo_epi32(r[3], r[4]);
+
+ // r63 r53....r60 r50|r53 r43....r50 r40||r43 r33....r40 r30|r33 r23....r30
+ // r20|
+ ss[1] = _mm256_unpacklo_epi8(rr[0], rr[1]);
+ // Process 4 rows at a time
+ while (y >= 4) {
+ s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
+ s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch));
+ s[9] = _mm_loadl_epi64((const __m128i *)(src_ptr + 9 * src_pitch));
+ s[10] = _mm_loadl_epi64((const __m128i *)(src_ptr + 10 * src_pitch));
+
+ r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[7], 1);
+ r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[8], 1);
+ rr[0] = _mm256_unpacklo_epi32(r[4], r[5]);
+ rr[1] = _mm256_unpacklo_epi32(r[5], r[6]);
+ ss[2] = _mm256_unpacklo_epi8(rr[0], rr[1]);
+
+ r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[9], 1);
+ r[8] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[8]), s[10], 1);
+ rr[0] = _mm256_unpacklo_epi32(r[6], r[7]);
+ rr[1] = _mm256_unpacklo_epi32(r[7], r[8]);
+ ss[3] = _mm256_unpacklo_epi8(rr[0], rr[1]);
+
+ ss[0] = convolve8_16_avx2(ss, f);
+
+ // r3 r2 r3 r2 r1 r0 r1 r0
+ ss[0] = _mm256_packus_epi16(ss[0], ss[0]);
+ src_ptr += src_stride;
+
+ mm256_storeu2_epi32((__m128i *const)output_ptr,
+ (__m128i *const)(output_ptr + (2 * out_pitch)), ss);
+
+ ss[0] = _mm256_srli_si256(ss[0], 4);
+
+ mm256_storeu2_epi32((__m128i *const)(output_ptr + (1 * out_pitch)),
+ (__m128i *const)(output_ptr + (3 * out_pitch)), ss);
+
+ output_ptr += out_stride;
+
+ ss[0] = ss[2];
+ ss[1] = ss[3];
+
+ s[6] = s[10];
+ s[5] = s[9];
+
+ r[4] = r[8];
+ y -= 4;
+ }
+
+ // Process 2 rows
+ if (y == 2) {
+ __m128i ss1[4], f1[4], r1[4];
+
+ s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch));
+ s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
+ s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch));
+
+ f1[0] = _mm256_castsi256_si128(f[0]);
+ f1[1] = _mm256_castsi256_si128(f[1]);
+ f1[2] = _mm256_castsi256_si128(f[2]);
+ f1[3] = _mm256_castsi256_si128(f[3]);
+
+ r1[0] = _mm_unpacklo_epi32(s[4], s[5]);
+ r1[1] = _mm_unpacklo_epi32(s[5], s[6]);
+
+ // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60
+ r1[2] = _mm_unpacklo_epi32(s[6], s[7]);
+
+ // R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70
+ r1[3] = _mm_unpacklo_epi32(s[7], s[8]);
+
+ // r23 r13....r20 r10|r13 r03....r10 r00
+ ss1[0] = _mm256_castsi256_si128(ss[0]);
+
+ // r43 r33....r40 r30|r33 r23....r30 r20
+ ss1[1] = _mm256_castsi256_si128(ss[1]);
+
+ // r63 r53....r60 r50|r53 r43....r50 r40
+ ss1[2] = _mm_unpacklo_epi8(r1[0], r1[1]);
+
+ // r83 r73....r80 r70|r73 r63....r70 r60
+ ss1[3] = _mm_unpacklo_epi8(r1[2], r1[3]);
+
+ ss1[0] = convolve8_8_ssse3(ss1, f1);
+
+ // r1 r0 r1 r0
+ ss1[0] = _mm_packus_epi16(ss1[0], ss1[0]);
+
+ // Save first row 4 values
+ *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(ss1[0]);
+ output_ptr += out_pitch;
+
+ ss1[0] = _mm_srli_si128(ss1[0], 4);
+ // Save second row 4 values
+ *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(ss1[0]);
+ }
+}
+
#if HAVE_AVX2 && HAVE_SSSE3
-filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
#if VPX_ARCH_X86_64
filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
-#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3
-#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_intrin_ssse3
-#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3
-#else // VPX_ARCH_X86
+#else // VPX_ARCH_X86
filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
-#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3
-#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3
-#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3
#endif // VPX_ARCH_X86_64
filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
@@ -897,7 +1307,6 @@ filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
-#define vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3
#define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3
#define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3
#define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3