diff options
Diffstat (limited to 'av1/common/x86/convolve_avx2.c')
-rw-r--r-- | av1/common/x86/convolve_avx2.c | 86 |
1 files changed, 50 insertions, 36 deletions
diff --git a/av1/common/x86/convolve_avx2.c b/av1/common/x86/convolve_avx2.c index 89e0a4c8f..30de98232 100644 --- a/av1/common/x86/convolve_avx2.c +++ b/av1/common/x86/convolve_avx2.c @@ -13,16 +13,16 @@ #include "config/av1_rtcd.h" +#include "third_party/SVT-AV1/convolve_avx2.h" + #include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/x86/convolve_common_intrin.h" #include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/convolve_common_intrin.h" #include "aom_dsp/x86/synonyms.h" -void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - const InterpFilterParams *filter_params_y, - const int subpel_y_qn) { - int i, j, vert_tap = SUBPEL_TAPS; +static AOM_INLINE void av1_convolve_y_sr_general_avx2( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn) { // right shift is F-1 because we are already dividing // filter co-efficients by 2 const int right_shift_bits = (FILTER_BITS - 1); @@ -32,16 +32,7 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, __m256i coeffs[6], s[12]; __m128i d[10]; - // Condition for checking valid vert_filt taps - const int16_t *const filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_qn & SUBPEL_MASK); - if (filter_params_y->taps == 12) { - vert_tap = 12; - } else if (!(filter[0] | filter[1] | filter[6] | filter[7])) { - vert_tap = 4; - } else if (!(filter[0] | filter[7])) { - vert_tap = 6; - } + int i, vert_tap = get_filter_tap(filter_params_y, subpel_y_qn); if (vert_tap == 6) prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs); @@ -55,7 +46,7 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, if (vert_tap == 4) { const int fo_vert = 1; const uint8_t *const src_ptr = src - fo_vert * src_stride; - for (j = 0; j < w; j += 16) { + for (int j = 0; j < w; j += 16) { const uint8_t *data = &src_ptr[j]; d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); @@ -150,7 +141,7 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, const int fo_vert = vert_tap / 2 - 1; const uint8_t *const src_ptr = src - fo_vert * src_stride; - for (j = 0; j < w; j += 16) { + for (int j = 0; j < w; j += 16) { const uint8_t *data = &src_ptr[j]; __m256i src6; @@ -255,7 +246,7 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, right_shift = _mm_cvtsi32_si128(FILTER_BITS); right_shift_const = _mm256_set1_epi32((1 << FILTER_BITS) >> 1); - for (j = 0; j < w; j += 8) { + for (int j = 0; j < w; j += 8) { const uint8_t *data = &src_ptr[j]; __m256i src10; @@ -403,7 +394,7 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, const int fo_vert = filter_params_y->taps / 2 - 1; const uint8_t *const src_ptr = src - fo_vert * src_stride; - for (j = 0; j < w; j += 16) { + for (int j = 0; j < w; j += 16) { const uint8_t *data = &src_ptr[j]; __m256i src6; @@ -517,18 +508,33 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, } } -void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const int subpel_x_qn, - ConvolveParams *conv_params) { +void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t w, + int32_t h, + const InterpFilterParams *filter_params_y, + const int32_t subpel_y_q4) { + const int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4); + + if (vert_tap == 12) { + av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_q4); + } else { + av1_convolve_y_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_q4); + } +} + +static AOM_INLINE void av1_convolve_x_sr_general_avx2( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params) { const int bits = FILTER_BITS - conv_params->round_0; const __m128i round_shift = _mm_cvtsi32_si128(bits); __m256i round_0_const = _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1); - int i, horiz_tap = SUBPEL_TAPS; + int i, horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn); assert(bits >= 0); assert((FILTER_BITS - conv_params->round_1) >= 0 || @@ -539,16 +545,6 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2)); filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); - const int16_t *const filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_qn & SUBPEL_MASK); - if (filter_params_x->taps == 12) { - horiz_tap = 12; - } else if (!(filter[0] | filter[1] | filter[6] | filter[7])) { - horiz_tap = 4; - } else if (!(filter[0] | filter[7])) { - horiz_tap = 6; - } - if (horiz_tap == 6) prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs); else if (horiz_tap == 12) { @@ -900,3 +896,21 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, } } } + +void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t w, + int32_t h, + const InterpFilterParams *filter_params_x, + const int32_t subpel_x_q4, + ConvolveParams *conv_params) { + const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4); + + if (horz_tap == 12) { + av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_q4, conv_params); + } else { + av1_convolve_x_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_q4, + conv_params); + } +} |