1 files changed, 50 insertions, 36 deletions
diff --git a/av1/common/x86/convolve_avx2.c b/av1/common/x86/convolve_avx2.c
index 89e0a4c8f..30de98232 100644
--- a/av1/common/x86/convolve_avx2.c
+++ b/av1/common/x86/convolve_avx2.c
@@ -13,16 +13,16 @@
 
 #include "config/av1_rtcd.h"
 
+#include "third_party/SVT-AV1/convolve_avx2.h"
+
 #include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/x86/convolve_common_intrin.h"
 #include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
 #include "aom_dsp/x86/synonyms.h"
 
-void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
-                            int dst_stride, int w, int h,
-                            const InterpFilterParams *filter_params_y,
-                            const int subpel_y_qn) {
-  int i, j, vert_tap = SUBPEL_TAPS;
+static AOM_INLINE void av1_convolve_y_sr_general_avx2(
+    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn) {
   // right shift is F-1 because we are already dividing
   // filter co-efficients by 2
   const int right_shift_bits = (FILTER_BITS - 1);
@@ -32,16 +32,7 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
   __m256i coeffs[6], s[12];
   __m128i d[10];
 
-  // Condition for checking valid vert_filt taps
-  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_y, subpel_y_qn & SUBPEL_MASK);
-  if (filter_params_y->taps == 12) {
-    vert_tap = 12;
-  } else if (!(filter[0] | filter[1] | filter[6] | filter[7])) {
-    vert_tap = 4;
-  } else if (!(filter[0] | filter[7])) {
-    vert_tap = 6;
-  }
+  int i, vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
 
   if (vert_tap == 6)
     prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs);
@@ -55,7 +46,7 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
   if (vert_tap == 4) {
     const int fo_vert = 1;
     const uint8_t *const src_ptr = src - fo_vert * src_stride;
-    for (j = 0; j < w; j += 16) {
+    for (int j = 0; j < w; j += 16) {
       const uint8_t *data = &src_ptr[j];
       d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
       d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
@@ -150,7 +141,7 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
     const int fo_vert = vert_tap / 2 - 1;
     const uint8_t *const src_ptr = src - fo_vert * src_stride;
 
-    for (j = 0; j < w; j += 16) {
+    for (int j = 0; j < w; j += 16) {
       const uint8_t *data = &src_ptr[j];
       __m256i src6;
 
@@ -255,7 +246,7 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
     right_shift = _mm_cvtsi32_si128(FILTER_BITS);
     right_shift_const = _mm256_set1_epi32((1 << FILTER_BITS) >> 1);
 
-    for (j = 0; j < w; j += 8) {
+    for (int j = 0; j < w; j += 8) {
       const uint8_t *data = &src_ptr[j];
       __m256i src10;
 
@@ -403,7 +394,7 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
     const int fo_vert = filter_params_y->taps / 2 - 1;
     const uint8_t *const src_ptr = src - fo_vert * src_stride;
 
-    for (j = 0; j < w; j += 16) {
+    for (int j = 0; j < w; j += 16) {
       const uint8_t *data = &src_ptr[j];
       __m256i src6;
 
@@ -517,18 +508,33 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
   }
 }
 
-void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
-                            int dst_stride, int w, int h,
-                            const InterpFilterParams *filter_params_x,
-                            const int subpel_x_qn,
-                            ConvolveParams *conv_params) {
+void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t w,
+                            int32_t h,
+                            const InterpFilterParams *filter_params_y,
+                            const int32_t subpel_y_q4) {
+  // 12-tap kernels stay on the general path defined in this file; every other
+  // tap count reported by get_filter_tap() uses the specialized routine.
+  if (get_filter_tap(filter_params_y, subpel_y_q4) == 12) {
+    av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
+                                   filter_params_y, subpel_y_q4);
+    return;
+  }
+  av1_convolve_y_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
+                                     filter_params_y, subpel_y_q4);
+}
+
+static AOM_INLINE void av1_convolve_x_sr_general_avx2(
+    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params) {
   const int bits = FILTER_BITS - conv_params->round_0;
   const __m128i round_shift = _mm_cvtsi32_si128(bits);
   __m256i round_0_const =
       _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
   __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
   __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
-  int i, horiz_tap = SUBPEL_TAPS;
+  int i, horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
 
   assert(bits >= 0);
   assert((FILTER_BITS - conv_params->round_1) >= 0 ||
@@ -539,16 +545,6 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
   filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
   filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
 
-  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_qn & SUBPEL_MASK);
-  if (filter_params_x->taps == 12) {
-    horiz_tap = 12;
-  } else if (!(filter[0] | filter[1] | filter[6] | filter[7])) {
-    horiz_tap = 4;
-  } else if (!(filter[0] | filter[7])) {
-    horiz_tap = 6;
-  }
-
   if (horiz_tap == 6)
     prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs);
   else if (horiz_tap == 12) {
@@ -900,3 +896,21 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
     }
   }
 }
+
+void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t w,
+                            int32_t h,
+                            const InterpFilterParams *filter_params_x,
+                            const int32_t subpel_x_q4,
+                            ConvolveParams *conv_params) {
+  // 12-tap kernels stay on the general path defined in this file; every other
+  // tap count reported by get_filter_tap() uses the specialized routine.
+  if (get_filter_tap(filter_params_x, subpel_x_q4) == 12) {
+    av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
+                                   filter_params_x, subpel_x_q4, conv_params);
+    return;
+  }
+  av1_convolve_x_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
+                                     filter_params_x, subpel_x_q4,
+                                     conv_params);
+}