aboutsummaryrefslogtreecommitdiff
path: root/av1/common/x86/convolve_avx2.c
diff options
context:
space:
mode:
Diffstat (limited to 'av1/common/x86/convolve_avx2.c')
-rw-r--r--av1/common/x86/convolve_avx2.c86
1 files changed, 50 insertions, 36 deletions
diff --git a/av1/common/x86/convolve_avx2.c b/av1/common/x86/convolve_avx2.c
index 89e0a4c8f..30de98232 100644
--- a/av1/common/x86/convolve_avx2.c
+++ b/av1/common/x86/convolve_avx2.c
@@ -13,16 +13,16 @@
#include "config/av1_rtcd.h"
+#include "third_party/SVT-AV1/convolve_avx2.h"
+
#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/x86/convolve_common_intrin.h"
#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
#include "aom_dsp/x86/synonyms.h"
-void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
- int dst_stride, int w, int h,
- const InterpFilterParams *filter_params_y,
- const int subpel_y_qn) {
- int i, j, vert_tap = SUBPEL_TAPS;
+static AOM_INLINE void av1_convolve_y_sr_general_avx2(
+ const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn) {
// right shift is F-1 because we are already dividing
// filter co-efficients by 2
const int right_shift_bits = (FILTER_BITS - 1);
@@ -32,16 +32,7 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
__m256i coeffs[6], s[12];
__m128i d[10];
- // Condition for checking valid vert_filt taps
- const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_qn & SUBPEL_MASK);
- if (filter_params_y->taps == 12) {
- vert_tap = 12;
- } else if (!(filter[0] | filter[1] | filter[6] | filter[7])) {
- vert_tap = 4;
- } else if (!(filter[0] | filter[7])) {
- vert_tap = 6;
- }
+ int i, vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
if (vert_tap == 6)
prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs);
@@ -55,7 +46,7 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
if (vert_tap == 4) {
const int fo_vert = 1;
const uint8_t *const src_ptr = src - fo_vert * src_stride;
- for (j = 0; j < w; j += 16) {
+ for (int j = 0; j < w; j += 16) {
const uint8_t *data = &src_ptr[j];
d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
@@ -150,7 +141,7 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
const int fo_vert = vert_tap / 2 - 1;
const uint8_t *const src_ptr = src - fo_vert * src_stride;
- for (j = 0; j < w; j += 16) {
+ for (int j = 0; j < w; j += 16) {
const uint8_t *data = &src_ptr[j];
__m256i src6;
@@ -255,7 +246,7 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
right_shift = _mm_cvtsi32_si128(FILTER_BITS);
right_shift_const = _mm256_set1_epi32((1 << FILTER_BITS) >> 1);
- for (j = 0; j < w; j += 8) {
+ for (int j = 0; j < w; j += 8) {
const uint8_t *data = &src_ptr[j];
__m256i src10;
@@ -403,7 +394,7 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
const int fo_vert = filter_params_y->taps / 2 - 1;
const uint8_t *const src_ptr = src - fo_vert * src_stride;
- for (j = 0; j < w; j += 16) {
+ for (int j = 0; j < w; j += 16) {
const uint8_t *data = &src_ptr[j];
__m256i src6;
@@ -517,18 +508,33 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
}
}
-void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
- int dst_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const int subpel_x_qn,
- ConvolveParams *conv_params) {
+void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t w,
+ int32_t h,
+ const InterpFilterParams *filter_params_y,
+ const int32_t subpel_y_q4) {
+ const int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4);
+ // Public entry point: dispatch on the tap count returned by get_filter_tap().
+ if (vert_tap == 12) {  // only the 12-tap case is routed to the general path
+ av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
+ filter_params_y, subpel_y_q4);
+ } else {  // NOTE(review): callee not defined in this diff; presumably from third_party/SVT-AV1/convolve_avx2.h -- confirm
+ av1_convolve_y_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
+ filter_params_y, subpel_y_q4);
+ }
+}
+
+static AOM_INLINE void av1_convolve_x_sr_general_avx2(
+ const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+ ConvolveParams *conv_params) {
const int bits = FILTER_BITS - conv_params->round_0;
const __m128i round_shift = _mm_cvtsi32_si128(bits);
__m256i round_0_const =
_mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
__m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
__m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
- int i, horiz_tap = SUBPEL_TAPS;
+ int i, horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
assert(bits >= 0);
assert((FILTER_BITS - conv_params->round_1) >= 0 ||
@@ -539,16 +545,6 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
- const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_qn & SUBPEL_MASK);
- if (filter_params_x->taps == 12) {
- horiz_tap = 12;
- } else if (!(filter[0] | filter[1] | filter[6] | filter[7])) {
- horiz_tap = 4;
- } else if (!(filter[0] | filter[7])) {
- horiz_tap = 6;
- }
-
if (horiz_tap == 6)
prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs);
else if (horiz_tap == 12) {
@@ -900,3 +896,21 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
}
}
}
+
+void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t w,
+ int32_t h,
+ const InterpFilterParams *filter_params_x,
+ const int32_t subpel_x_q4,
+ ConvolveParams *conv_params) {
+ const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4);
+ // Public entry point: dispatch on the tap count returned by get_filter_tap().
+ if (horz_tap == 12) {  // only the 12-tap case is routed to the general path
+ av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, subpel_x_q4, conv_params);
+ } else {  // NOTE(review): callee not defined in this diff; presumably from third_party/SVT-AV1/convolve_avx2.h -- confirm
+ av1_convolve_x_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, subpel_x_q4,
+ conv_params);
+ }
+}