Diffstat (limited to 'vpx_dsp/x86')
25 files changed, 3257 insertions, 544 deletions
diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c index b2e01319d..61e4e73c5 100644 --- a/vpx_dsp/x86/avg_intrin_avx2.c +++ b/vpx_dsp/x86/avg_intrin_avx2.c @@ -218,6 +218,14 @@ void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff, } #endif // CONFIG_VP9_HIGHBITDEPTH +static INLINE void sign_extend_16bit_to_32bit_avx2(__m256i in, __m256i zero, + __m256i *out_lo, + __m256i *out_hi) { + const __m256i sign_bits = _mm256_cmpgt_epi16(zero, in); + *out_lo = _mm256_unpacklo_epi16(in, sign_bits); + *out_hi = _mm256_unpackhi_epi16(in, sign_bits); +} + static void hadamard_col8x2_avx2(__m256i *in, int iter) { __m256i a0 = in[0]; __m256i a1 = in[1]; @@ -400,6 +408,12 @@ void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *t_coeff = coeff; #endif int idx; + __m256i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo, + b3_lo; + __m256i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi, + b3_hi; + __m256i b0, b1, b2, b3; + const __m256i zero = _mm256_setzero_si256(); for (idx = 0; idx < 4; ++idx) { // src_diff: 9 bit, dynamic range [-255, 255] const int16_t *src_ptr = @@ -414,15 +428,38 @@ void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512)); const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768)); - __m256i b0 = _mm256_add_epi16(coeff0, coeff1); - __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); - __m256i b2 = _mm256_add_epi16(coeff2, coeff3); - __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + // Sign extend 16 bit to 32 bit. + sign_extend_16bit_to_32bit_avx2(coeff0, zero, &coeff0_lo, &coeff0_hi); + sign_extend_16bit_to_32bit_avx2(coeff1, zero, &coeff1_lo, &coeff1_hi); + sign_extend_16bit_to_32bit_avx2(coeff2, zero, &coeff2_lo, &coeff2_hi); + sign_extend_16bit_to_32bit_avx2(coeff3, zero, &coeff3_lo, &coeff3_hi); + + b0_lo = _mm256_add_epi32(coeff0_lo, coeff1_lo); + b0_hi = _mm256_add_epi32(coeff0_hi, coeff1_hi); + + b1_lo = _mm256_sub_epi32(coeff0_lo, coeff1_lo); + b1_hi = _mm256_sub_epi32(coeff0_hi, coeff1_hi); + + b2_lo = _mm256_add_epi32(coeff2_lo, coeff3_lo); + b2_hi = _mm256_add_epi32(coeff2_hi, coeff3_hi); + + b3_lo = _mm256_sub_epi32(coeff2_lo, coeff3_lo); + b3_hi = _mm256_sub_epi32(coeff2_hi, coeff3_hi); + + b0_lo = _mm256_srai_epi32(b0_lo, 2); + b1_lo = _mm256_srai_epi32(b1_lo, 2); + b2_lo = _mm256_srai_epi32(b2_lo, 2); + b3_lo = _mm256_srai_epi32(b3_lo, 2); + + b0_hi = _mm256_srai_epi32(b0_hi, 2); + b1_hi = _mm256_srai_epi32(b1_hi, 2); + b2_hi = _mm256_srai_epi32(b2_hi, 2); + b3_hi = _mm256_srai_epi32(b3_hi, 2); - b0 = _mm256_srai_epi16(b0, 2); - b1 = _mm256_srai_epi16(b1, 2); - b2 = _mm256_srai_epi16(b2, 2); - b3 = _mm256_srai_epi16(b3, 2); + b0 = _mm256_packs_epi32(b0_lo, b0_hi); + b1 = _mm256_packs_epi32(b1_lo, b1_hi); + b2 = _mm256_packs_epi32(b2_lo, b2_hi); + b3 = _mm256_packs_epi32(b3_lo, b3_hi); store_tran_low(_mm256_add_epi16(b0, b2), coeff); store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256); diff --git a/vpx_dsp/x86/avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c index 015c11a1f..4447dfab7 100644 --- a/vpx_dsp/x86/avg_intrin_sse2.c +++ b/vpx_dsp/x86/avg_intrin_sse2.c @@ -15,6 +15,14 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_ports/mem.h" +static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero, + __m128i *out_lo, + __m128i *out_hi) { + const __m128i sign_bits = _mm_cmplt_epi16(in, zero); + *out_lo = _mm_unpacklo_epi16(in, 
sign_bits); + *out_hi = _mm_unpackhi_epi16(in, sign_bits); +} + void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max) { __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff; @@ -400,6 +408,12 @@ void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *t_coeff = coeff; #endif int idx; + __m128i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo, + b3_lo; + __m128i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi, + b3_hi; + __m128i b0, b1, b2, b3; + const __m128i zero = _mm_setzero_si128(); for (idx = 0; idx < 4; ++idx) { const int16_t *src_ptr = src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; @@ -413,15 +427,38 @@ void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512)); __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768)); - __m128i b0 = _mm_add_epi16(coeff0, coeff1); - __m128i b1 = _mm_sub_epi16(coeff0, coeff1); - __m128i b2 = _mm_add_epi16(coeff2, coeff3); - __m128i b3 = _mm_sub_epi16(coeff2, coeff3); + // Sign extend 16 bit to 32 bit. + sign_extend_16bit_to_32bit_sse2(coeff0, zero, &coeff0_lo, &coeff0_hi); + sign_extend_16bit_to_32bit_sse2(coeff1, zero, &coeff1_lo, &coeff1_hi); + sign_extend_16bit_to_32bit_sse2(coeff2, zero, &coeff2_lo, &coeff2_hi); + sign_extend_16bit_to_32bit_sse2(coeff3, zero, &coeff3_lo, &coeff3_hi); + + b0_lo = _mm_add_epi32(coeff0_lo, coeff1_lo); + b0_hi = _mm_add_epi32(coeff0_hi, coeff1_hi); + + b1_lo = _mm_sub_epi32(coeff0_lo, coeff1_lo); + b1_hi = _mm_sub_epi32(coeff0_hi, coeff1_hi); + + b2_lo = _mm_add_epi32(coeff2_lo, coeff3_lo); + b2_hi = _mm_add_epi32(coeff2_hi, coeff3_hi); + + b3_lo = _mm_sub_epi32(coeff2_lo, coeff3_lo); + b3_hi = _mm_sub_epi32(coeff2_hi, coeff3_hi); + + b0_lo = _mm_srai_epi32(b0_lo, 2); + b1_lo = _mm_srai_epi32(b1_lo, 2); + b2_lo = _mm_srai_epi32(b2_lo, 2); + b3_lo = _mm_srai_epi32(b3_lo, 2); + + b0_hi = _mm_srai_epi32(b0_hi, 2); + b1_hi = _mm_srai_epi32(b1_hi, 2); + b2_hi = _mm_srai_epi32(b2_hi, 2); + b3_hi = _mm_srai_epi32(b3_hi, 2); - b0 = _mm_srai_epi16(b0, 2); - b1 = _mm_srai_epi16(b1, 2); - b2 = _mm_srai_epi16(b2, 2); - b3 = _mm_srai_epi16(b3, 2); + b0 = _mm_packs_epi32(b0_lo, b0_hi); + b1 = _mm_packs_epi32(b1_lo, b1_hi); + b2 = _mm_packs_epi32(b2_lo, b2_hi); + b3 = _mm_packs_epi32(b3_lo, b3_hi); coeff0 = _mm_add_epi16(b0, b2); coeff1 = _mm_add_epi16(b1, b3); diff --git a/vpx_dsp/x86/avg_pred_avx2.c b/vpx_dsp/x86/avg_pred_avx2.c new file mode 100644 index 000000000..f4357998c --- /dev/null +++ b/vpx_dsp/x86/avg_pred_avx2.c @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <immintrin.h> + +#include "./vpx_dsp_rtcd.h" + +void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + int row = 0; + // comp_pred and pred must be 32 byte aligned. 
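[Editor's note] For orientation, the new vpx_comp_avg_pred_avx2 paths that follow all vectorize the same per-pixel rounding average as the scalar C reference. A minimal scalar sketch, not part of the diff (the helper name comp_avg_pred_ref is illustrative):

#include <stdint.h>

/* comp_pred[i] = (pred[i] + ref[i] + 1) >> 1, the rounding byte average that
 * _mm256_avg_epu8 computes per lane in the intrinsics below. */
static void comp_avg_pred_ref(uint8_t *comp_pred, const uint8_t *pred,
                              int width, int height, const uint8_t *ref,
                              int ref_stride) {
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      comp_pred[x] = (uint8_t)((pred[x] + ref[x] + 1) >> 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}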
+ assert(((intptr_t)comp_pred % 32) == 0); + assert(((intptr_t)pred % 32) == 0); + + if (width == 8) { + assert(height % 4 == 0); + do { + const __m256i p = _mm256_load_si256((const __m256i *)pred); + const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref); + const __m128i r_1 = + _mm_loadl_epi64((const __m128i *)(ref + 2 * ref_stride)); + + const __m128i r1 = _mm_castps_si128(_mm_loadh_pi( + _mm_castsi128_ps(r_0), (const __m64 *)(ref + ref_stride))); + const __m128i r2 = _mm_castps_si128(_mm_loadh_pi( + _mm_castsi128_ps(r_1), (const __m64 *)(ref + 3 * ref_stride))); + + const __m256i ref_0123 = + _mm256_inserti128_si256(_mm256_castsi128_si256(r1), r2, 1); + const __m256i avg = _mm256_avg_epu8(p, ref_0123); + + _mm256_store_si256((__m256i *)comp_pred, avg); + + row += 4; + pred += 32; + comp_pred += 32; + ref += 4 * ref_stride; + } while (row < height); + } else if (width == 16) { + assert(height % 4 == 0); + do { + const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred); + const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32)); + const __m256i tmp0 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)ref)); + const __m256i ref_0 = _mm256_inserti128_si256( + tmp0, _mm_loadu_si128((const __m128i *)(ref + ref_stride)), 1); + const __m256i tmp1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(ref + 2 * ref_stride))); + const __m256i ref_1 = _mm256_inserti128_si256( + tmp1, _mm_loadu_si128((const __m128i *)(ref + 3 * ref_stride)), 1); + const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0); + const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1); + _mm256_store_si256((__m256i *)comp_pred, average_0); + _mm256_store_si256((__m256i *)(comp_pred + 32), average_1); + + row += 4; + pred += 64; + comp_pred += 64; + ref += 4 * ref_stride; + } while (row < height); + } else if (width == 32) { + assert(height % 2 == 0); + do { + const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred); + const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32)); + const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i ref_1 = + _mm256_loadu_si256((const __m256i *)(ref + ref_stride)); + const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0); + const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1); + _mm256_store_si256((__m256i *)comp_pred, average_0); + _mm256_store_si256((__m256i *)(comp_pred + 32), average_1); + + row += 2; + pred += 64; + comp_pred += 64; + ref += 2 * ref_stride; + } while (row < height); + } else if (width % 64 == 0) { + do { + int x; + for (x = 0; x < width; x += 64) { + const __m256i pred_0 = _mm256_load_si256((const __m256i *)(pred + x)); + const __m256i pred_1 = + _mm256_load_si256((const __m256i *)(pred + x + 32)); + const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)(ref + x)); + const __m256i ref_1 = + _mm256_loadu_si256((const __m256i *)(ref + x + 32)); + const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0); + const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1); + _mm256_store_si256((__m256i *)(comp_pred + x), average_0); + _mm256_store_si256((__m256i *)(comp_pred + x + 32), average_1); + } + row++; + pred += width; + comp_pred += width; + ref += ref_stride; + } while (row < height); + } else { + vpx_comp_avg_pred_sse2(comp_pred, pred, width, height, ref, ref_stride); + } +} diff --git a/vpx_dsp/x86/fwd_txfm_avx2.c b/vpx_dsp/x86/fwd_txfm_avx2.c index a2ed420e3..c8f54a49c 100644 --- a/vpx_dsp/x86/fwd_txfm_avx2.c +++ b/vpx_dsp/x86/fwd_txfm_avx2.c @@ -8,9 +8,382 @@ * be found in 
the AUTHORS file in the root of the source tree. */ +#include <immintrin.h> // AVX2 #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#define ADD256_EPI16 _mm256_add_epi16 +#define SUB256_EPI16 _mm256_sub_epi16 + +static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in, + int stride, __m256i *out, + int out_size, int pass) { + int i; + const __m256i kOne = _mm256_set1_epi16(1); + if (pass == 0) { + for (i = 0; i < out_size; i++) { + out[i] = _mm256_loadu_si256((const __m256i *)(in + i * stride)); + // x = x << 2 + out[i] = _mm256_slli_epi16(out[i], 2); + } + } else { + for (i = 0; i < out_size; i++) { + out[i] = _mm256_loadu_si256((const __m256i *)(in + i * 16)); + // x = (x + 1) >> 2 + out[i] = _mm256_add_epi16(out[i], kOne); + out[i] = _mm256_srai_epi16(out[i], 2); + } + } +} + +static INLINE void transpose2_8x8_avx2(const __m256i *const in, + __m256i *const out) { + int i; + __m256i t[16], u[16]; + // (1st, 2nd) ==> (lo, hi) + // (0, 1) ==> (0, 1) + // (2, 3) ==> (2, 3) + // (4, 5) ==> (4, 5) + // (6, 7) ==> (6, 7) + for (i = 0; i < 4; i++) { + t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]); + t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 2) ==> (0, 2) + // (1, 3) ==> (1, 3) + // (4, 6) ==> (4, 6) + // (5, 7) ==> (5, 7) + for (i = 0; i < 2; i++) { + u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]); + u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]); + + u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]); + u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 4) ==> (0, 1) + // (1, 5) ==> (4, 5) + // (2, 6) ==> (2, 3) + // (3, 7) ==> (6, 7) + for (i = 0; i < 2; i++) { + out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]); + out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]); + + out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]); + out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]); + } +} + +static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in, + __m256i *const out) { + __m256i t[16]; + +#define LOADL(idx) \ + t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \ + t[idx] = _mm256_inserti128_si256( \ + t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1); + +#define LOADR(idx) \ + t[8 + idx] = \ + _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \ + t[8 + idx] = _mm256_inserti128_si256( \ + t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1); + + // load left 8x16 + LOADL(0) + LOADL(1) + LOADL(2) + LOADL(3) + LOADL(4) + LOADL(5) + LOADL(6) + LOADL(7) + + // load right 8x16 + LOADR(0) + LOADR(1) + LOADR(2) + LOADR(3) + LOADR(4) + LOADR(5) + LOADR(6) + LOADR(7) + + // get the top 16x8 result + transpose2_8x8_avx2(t, out); + // get the bottom 16x8 result + transpose2_8x8_avx2(&t[8], &out[8]); +} + +// Store 8 16-bit values. Sign extend the values. 
+static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in, + tran_low_t *out, + const int stride, + const int out_size) { + int i; + for (i = 0; i < out_size; ++i) { + _mm256_storeu_si256((__m256i *)(out), in[i]); + out += stride; + } +} + +#define PAIR256_SET_EPI16(a, b) \ + _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) + +static INLINE __m256i mult256_round_shift(const __m256i *pin0, + const __m256i *pin1, + const __m256i *pmultiplier, + const __m256i *prounding, + const int shift) { + const __m256i u0 = _mm256_madd_epi16(*pin0, *pmultiplier); + const __m256i u1 = _mm256_madd_epi16(*pin1, *pmultiplier); + const __m256i v0 = _mm256_add_epi32(u0, *prounding); + const __m256i v1 = _mm256_add_epi32(u1, *prounding); + const __m256i w0 = _mm256_srai_epi32(v0, shift); + const __m256i w1 = _mm256_srai_epi32(v1, shift); + return _mm256_packs_epi32(w0, w1); +} + +static INLINE void fdct16x16_1D_avx2(__m256i *input, __m256i *output) { + int i; + __m256i step2[4]; + __m256i in[8]; + __m256i step1[8]; + __m256i step3[8]; + + const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(cospi_16_64); + const __m256i k__cospi_p16_m16 = PAIR256_SET_EPI16(cospi_16_64, -cospi_16_64); + const __m256i k__cospi_p24_p08 = PAIR256_SET_EPI16(cospi_24_64, cospi_8_64); + const __m256i k__cospi_p08_m24 = PAIR256_SET_EPI16(cospi_8_64, -cospi_24_64); + const __m256i k__cospi_m08_p24 = PAIR256_SET_EPI16(-cospi_8_64, cospi_24_64); + const __m256i k__cospi_p28_p04 = PAIR256_SET_EPI16(cospi_28_64, cospi_4_64); + const __m256i k__cospi_m04_p28 = PAIR256_SET_EPI16(-cospi_4_64, cospi_28_64); + const __m256i k__cospi_p12_p20 = PAIR256_SET_EPI16(cospi_12_64, cospi_20_64); + const __m256i k__cospi_m20_p12 = PAIR256_SET_EPI16(-cospi_20_64, cospi_12_64); + const __m256i k__cospi_p30_p02 = PAIR256_SET_EPI16(cospi_30_64, cospi_2_64); + const __m256i k__cospi_p14_p18 = PAIR256_SET_EPI16(cospi_14_64, cospi_18_64); + const __m256i k__cospi_m02_p30 = PAIR256_SET_EPI16(-cospi_2_64, cospi_30_64); + const __m256i k__cospi_m18_p14 = PAIR256_SET_EPI16(-cospi_18_64, cospi_14_64); + const __m256i k__cospi_p22_p10 = PAIR256_SET_EPI16(cospi_22_64, cospi_10_64); + const __m256i k__cospi_p06_p26 = PAIR256_SET_EPI16(cospi_6_64, cospi_26_64); + const __m256i k__cospi_m10_p22 = PAIR256_SET_EPI16(-cospi_10_64, cospi_22_64); + const __m256i k__cospi_m26_p06 = PAIR256_SET_EPI16(-cospi_26_64, cospi_6_64); + const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING); + + // Calculate input for the first 8 results. + for (i = 0; i < 8; i++) { + in[i] = ADD256_EPI16(input[i], input[15 - i]); + } + + // Calculate input for the next 8 results. 
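[Editor's note] The two loops around this point implement the standard even/odd input split of the 16-point forward DCT: the sums feed an 8-point DCT that yields the even-indexed outputs, while the differences yield the odd-indexed outputs. A scalar sketch of the split, not part of the diff (the helper name is made up):

static void fdct16_input_split(const int16_t x[16], int16_t even_in[8],
                               int16_t odd_in[8]) {
  int i;
  for (i = 0; i < 8; i++) {
    even_in[i] = (int16_t)(x[i] + x[15 - i]);   /* -> output[0], [2], [4], ... */
    odd_in[i] = (int16_t)(x[7 - i] - x[8 + i]); /* -> output[1], [3], [5], ... */
  }
}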
+ for (i = 0; i < 8; i++) { + step1[i] = SUB256_EPI16(input[7 - i], input[8 + i]); + } + + // Work on the first eight values; fdct8(input, even_results); + { + // Add/subtract + const __m256i q0 = ADD256_EPI16(in[0], in[7]); + const __m256i q1 = ADD256_EPI16(in[1], in[6]); + const __m256i q2 = ADD256_EPI16(in[2], in[5]); + const __m256i q3 = ADD256_EPI16(in[3], in[4]); + const __m256i q4 = SUB256_EPI16(in[3], in[4]); + const __m256i q5 = SUB256_EPI16(in[2], in[5]); + const __m256i q6 = SUB256_EPI16(in[1], in[6]); + const __m256i q7 = SUB256_EPI16(in[0], in[7]); + + // Work on first four results + { + // Add/subtract + const __m256i r0 = ADD256_EPI16(q0, q3); + const __m256i r1 = ADD256_EPI16(q1, q2); + const __m256i r2 = SUB256_EPI16(q1, q2); + const __m256i r3 = SUB256_EPI16(q0, q3); + + // Interleave to do the multiply by constants which gets us + // into 32 bits. + { + const __m256i t0 = _mm256_unpacklo_epi16(r0, r1); + const __m256i t1 = _mm256_unpackhi_epi16(r0, r1); + const __m256i t2 = _mm256_unpacklo_epi16(r2, r3); + const __m256i t3 = _mm256_unpackhi_epi16(r2, r3); + + output[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[8] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[4] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[12] = + mult256_round_shift(&t2, &t3, &k__cospi_m08_p24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + } + + // Work on next four results + { + // Interleave to do the multiply by constants which gets us + // into 32 bits. + const __m256i d0 = _mm256_unpacklo_epi16(q6, q5); + const __m256i d1 = _mm256_unpackhi_epi16(q6, q5); + const __m256i r0 = mult256_round_shift( + &d0, &d1, &k__cospi_p16_m16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + const __m256i r1 = mult256_round_shift( + &d0, &d1, &k__cospi_p16_p16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + + { + // Add/subtract + const __m256i x0 = ADD256_EPI16(q4, r0); + const __m256i x1 = SUB256_EPI16(q4, r0); + const __m256i x2 = SUB256_EPI16(q7, r1); + const __m256i x3 = ADD256_EPI16(q7, r1); + + // Interleave to do the multiply by constants which gets us + // into 32 bits. 
+ { + const __m256i t0 = _mm256_unpacklo_epi16(x0, x3); + const __m256i t1 = _mm256_unpackhi_epi16(x0, x3); + const __m256i t2 = _mm256_unpacklo_epi16(x1, x2); + const __m256i t3 = _mm256_unpackhi_epi16(x1, x2); + output[2] = + mult256_round_shift(&t0, &t1, &k__cospi_p28_p04, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[14] = + mult256_round_shift(&t0, &t1, &k__cospi_m04_p28, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[10] = + mult256_round_shift(&t2, &t3, &k__cospi_p12_p20, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[6] = + mult256_round_shift(&t2, &t3, &k__cospi_m20_p12, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + } + } + } + // Work on the next eight values; step1 -> odd_results + { // step 2 + { + const __m256i t0 = _mm256_unpacklo_epi16(step1[5], step1[2]); + const __m256i t1 = _mm256_unpackhi_epi16(step1[5], step1[2]); + const __m256i t2 = _mm256_unpacklo_epi16(step1[4], step1[3]); + const __m256i t3 = _mm256_unpackhi_epi16(step1[4], step1[3]); + step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + // step 3 + { + step3[0] = ADD256_EPI16(step1[0], step2[1]); + step3[1] = ADD256_EPI16(step1[1], step2[0]); + step3[2] = SUB256_EPI16(step1[1], step2[0]); + step3[3] = SUB256_EPI16(step1[0], step2[1]); + step3[4] = SUB256_EPI16(step1[7], step2[3]); + step3[5] = SUB256_EPI16(step1[6], step2[2]); + step3[6] = ADD256_EPI16(step1[6], step2[2]); + step3[7] = ADD256_EPI16(step1[7], step2[3]); + } + // step 4 + { + const __m256i t0 = _mm256_unpacklo_epi16(step3[1], step3[6]); + const __m256i t1 = _mm256_unpackhi_epi16(step3[1], step3[6]); + const __m256i t2 = _mm256_unpacklo_epi16(step3[2], step3[5]); + const __m256i t3 = _mm256_unpackhi_epi16(step3[2], step3[5]); + step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_m08_p24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p08_m24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + // step 5 + { + step1[0] = ADD256_EPI16(step3[0], step2[0]); + step1[1] = SUB256_EPI16(step3[0], step2[0]); + step1[2] = ADD256_EPI16(step3[3], step2[1]); + step1[3] = SUB256_EPI16(step3[3], step2[1]); + step1[4] = SUB256_EPI16(step3[4], step2[3]); + step1[5] = ADD256_EPI16(step3[4], step2[3]); + step1[6] = SUB256_EPI16(step3[7], step2[2]); + step1[7] = ADD256_EPI16(step3[7], step2[2]); + } + // step 6 + { + const __m256i t0 = _mm256_unpacklo_epi16(step1[0], step1[7]); + const __m256i t1 = _mm256_unpackhi_epi16(step1[0], step1[7]); + const __m256i t2 = _mm256_unpacklo_epi16(step1[1], step1[6]); + const __m256i t3 = _mm256_unpackhi_epi16(step1[1], step1[6]); + output[1] = mult256_round_shift(&t0, &t1, &k__cospi_p30_p02, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[9] = mult256_round_shift(&t2, &t3, &k__cospi_p14_p18, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[15] = mult256_round_shift(&t0, &t1, &k__cospi_m02_p30, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[7] = mult256_round_shift(&t2, &t3, 
&k__cospi_m18_p14, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + { + const __m256i t0 = _mm256_unpacklo_epi16(step1[2], step1[5]); + const __m256i t1 = _mm256_unpackhi_epi16(step1[2], step1[5]); + const __m256i t2 = _mm256_unpacklo_epi16(step1[3], step1[4]); + const __m256i t3 = _mm256_unpackhi_epi16(step1[3], step1[4]); + output[5] = mult256_round_shift(&t0, &t1, &k__cospi_p22_p10, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[13] = mult256_round_shift(&t2, &t3, &k__cospi_p06_p26, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[11] = mult256_round_shift(&t0, &t1, &k__cospi_m10_p22, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[3] = mult256_round_shift(&t2, &t3, &k__cospi_m26_p06, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + } +} + +void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride) { + int pass; + DECLARE_ALIGNED(32, int16_t, intermediate[256]); + int16_t *out0 = intermediate; + tran_low_t *out1 = output; + const int width = 16; + const int height = 16; + __m256i buf0[16], buf1[16]; + + // Two transform and transpose passes + // Process 16 columns (transposed rows in second pass) at a time. + for (pass = 0; pass < 2; ++pass) { + // Load and pre-condition input. + load_buffer_16bit_to_16bit_avx2(input, stride, buf1, height, pass); + + // Calculate dct for 16x16 values + fdct16x16_1D_avx2(buf1, buf0); + + // Transpose the results. + transpose_16bit_16x16_avx2(buf0, buf1); + + if (pass == 0) { + store_buffer_16bit_to_32bit_w16_avx2(buf1, out0, width, height); + } else { + store_buffer_16bit_to_32bit_w16_avx2(buf1, out1, width, height); + } + // Setup in/out for next pass. + input = intermediate; + } +} + #if !CONFIG_VP9_HIGHBITDEPTH #define FDCT32x32_2D_AVX2 vpx_fdct32x32_rd_avx2 #define FDCT32x32_HIGH_PRECISION 0 diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c index 8edddd637..35ca55404 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -11,6 +11,8 @@ #include <immintrin.h> #include "./vpx_dsp_rtcd.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) { const __m128i sign = _mm_srai_epi16(*p, 15); @@ -26,17 +28,15 @@ static VPX_FORCE_INLINE void update_qp(__m256i *qp) { } } -static VPX_FORCE_INLINE void init_qp(const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *dequant_ptr, - const int16_t *quant_shift_ptr, - __m256i *qp, int log_scale) { - const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr); - const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr); - const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr); +static VPX_FORCE_INLINE void init_qp( + const struct macroblock_plane *const mb_plane, const int16_t *dequant_ptr, + __m256i *qp, int log_scale) { + const __m128i zbin = _mm_loadu_si128((const __m128i *)mb_plane->zbin); + const __m128i round = _mm_loadu_si128((const __m128i *)mb_plane->round); + const __m128i quant = _mm_loadu_si128((const __m128i *)mb_plane->quant); const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr); - const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr); + const __m128i quant_shift = + _mm_loadu_si128((const __m128i *)mb_plane->quant_shift); init_one_qp(&zbin, &qp[0]); init_one_qp(&round, &qp[1]); init_one_qp(&quant, &qp[2]); @@ -134,19 +134,16 @@ static VPX_FORCE_INLINE void quantize(const __m256i *qp, 
} void vpx_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const int step = 8; __m256i eob = _mm256_setzero_si256(); __m256i qp[5]; - (void)scan; + const int16_t *iscan = scan_order->iscan; - init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 0); + init_qp(mb_plane, dequant_ptr, qp, 0); quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); @@ -222,17 +219,16 @@ static VPX_FORCE_INLINE void quantize_b_32x32( } void vpx_highbd_quantize_b_32x32_avx2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { const unsigned int step = 8; + intptr_t n_coeffs = 32 * 32; + const int16_t *iscan = scan_order->iscan; __m256i eob = _mm256_setzero_si256(); __m256i qp[5]; - (void)scan; - init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 1); + init_qp(mb_plane, dequant_ptr, qp, 1); quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index ae1981a83..adae60756 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -15,19 +15,22 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" -#if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i, j, non_zero_regs = (int)count / 4, eob_i = 0; __m128i zbins[2]; __m128i nzbins[2]; + const int16_t *iscan = scan_order->iscan; + const int16_t *zbin_ptr = mb_plane->zbin; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[0]); @@ -38,8 +41,6 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); - (void)scan; - memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); @@ -93,19 +94,18 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, } void vpx_highbd_quantize_b_32x32_sse2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const 
int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { __m128i zbins[2]; __m128i nzbins[2]; int idx = 0; int idx_arr[1024]; int i, eob = 0; - const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); - const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); - (void)scan; + const intptr_t n_coeffs = 32 * 32; + const int16_t *iscan = scan_order->iscan; + const int zbin0_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1); + const int zbin1_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1); zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); zbins[1] = _mm_set1_epi32(zbin1_tmp); @@ -140,14 +140,14 @@ void vpx_highbd_quantize_b_32x32_sse2( const int coeff = coeff_ptr[rc]; const int coeff_sign = (coeff >> 31); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(mb_plane->round[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * mb_plane->quant[rc != 0]) >> 16) + tmp1; const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + (uint32_t)((tmp2 * mb_plane->quant_shift[rc != 0]) >> 15); qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? 
iscan[idx_arr[i]] : eob; } *eob_ptr = eob; } -#endif diff --git a/vpx_dsp/x86/highbd_sad4d_avx2.c b/vpx_dsp/x86/highbd_sad4d_avx2.c index 947b5e977..e483fdce7 100644 --- a/vpx_dsp/x86/highbd_sad4d_avx2.c +++ b/vpx_dsp/x86/highbd_sad4d_avx2.c @@ -61,70 +61,79 @@ static VPX_FORCE_INLINE void highbd_sad64xHx4d(__m256i *sums_16 /*[4]*/, } } +static VPX_FORCE_INLINE void highbd_sad64xNx4d_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4], int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + __m256i sums_32[4]; + int i; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_32[0] = _mm256_setzero_si256(); + sums_32[1] = _mm256_setzero_si256(); + sums_32[2] = _mm256_setzero_si256(); + sums_32[3] = _mm256_setzero_si256(); + + for (i = 0; i < (n / 2); ++i) { + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2); + + /* sums_16 will outrange after 2 rows, so add current sums_16 to + * sums_32*/ + sums_32[0] = _mm256_add_epi32( + sums_32[0], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)))); + sums_32[1] = _mm256_add_epi32( + sums_32[1], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)))); + sums_32[2] = _mm256_add_epi32( + sums_32[2], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)))); + sums_32[3] = _mm256_add_epi32( + sums_32[3], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)))); + + src += src_stride << 1; + } + calc_final_4(sums_32, sad_array); +} + #define HIGHBD_SAD64XNX4D(n) \ - void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \ + void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src, int src_stride, \ const uint8_t *const ref_array[4], \ int ref_stride, uint32_t sad_array[4]) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *refs[4]; \ - __m256i sums_16[4]; \ - __m256i sums_32[4]; \ - int i; \ - \ - refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \ - refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \ - refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \ - refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \ - sums_32[0] = _mm256_setzero_si256(); \ - sums_32[1] = _mm256_setzero_si256(); \ - sums_32[2] = _mm256_setzero_si256(); \ - sums_32[3] = _mm256_setzero_si256(); \ - \ - for (i = 0; i < (n / 2); ++i) { \ - sums_16[0] = _mm256_setzero_si256(); \ - sums_16[1] = _mm256_setzero_si256(); \ - sums_16[2] = _mm256_setzero_si256(); \ - sums_16[3] = _mm256_setzero_si256(); \ - \ - highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2); \ - \ - /* sums_16 will outrange after 2 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32[0] = _mm256_add_epi32( \ - sums_32[0], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[0], 1)))); \ - 
sums_32[1] = _mm256_add_epi32( \ - sums_32[1], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[1], 1)))); \ - sums_32[2] = _mm256_add_epi32( \ - sums_32[2], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[2], 1)))); \ - sums_32[3] = _mm256_add_epi32( \ - sums_32[3], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[3], 1)))); \ - \ - src += src_stride << 1; \ - } \ - calc_final_4(sums_32, sad_array); \ + highbd_sad64xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ + n); \ } -// 64x64 -HIGHBD_SAD64XNX4D(64) - -// 64x32 -HIGHBD_SAD64XNX4D(32) +#define HIGHBD_SADSKIP64XNx4D(n) \ + void vpx_highbd_sad_skip_64x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad64xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, n / 2); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/, const uint16_t *src, @@ -171,73 +180,79 @@ static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/, } } +static VPX_FORCE_INLINE void highbd_sad32xNx4d_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4], int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + __m256i sums_32[4]; + int i; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_32[0] = _mm256_setzero_si256(); + sums_32[1] = _mm256_setzero_si256(); + sums_32[2] = _mm256_setzero_si256(); + sums_32[3] = _mm256_setzero_si256(); + + for (i = 0; i < (n / 8); ++i) { + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); + + /* sums_16 will outrange after 8 rows, so add current sums_16 to + * sums_32*/ + sums_32[0] = _mm256_add_epi32( + sums_32[0], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)))); + sums_32[1] = _mm256_add_epi32( + sums_32[1], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)))); + sums_32[2] = _mm256_add_epi32( + sums_32[2], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)))); + sums_32[3] = _mm256_add_epi32( + sums_32[3], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)))); + + src += src_stride << 3; + } + calc_final_4(sums_32, sad_array); +} + #define HIGHBD_SAD32XNX4D(n) \ - void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \ + void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src, int src_stride, \ const uint8_t *const ref_array[4], \ int ref_stride, 
uint32_t sad_array[4]) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *refs[4]; \ - __m256i sums_16[4]; \ - __m256i sums_32[4]; \ - int i; \ - \ - refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \ - refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \ - refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \ - refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \ - sums_32[0] = _mm256_setzero_si256(); \ - sums_32[1] = _mm256_setzero_si256(); \ - sums_32[2] = _mm256_setzero_si256(); \ - sums_32[3] = _mm256_setzero_si256(); \ - \ - for (i = 0; i < (n / 8); ++i) { \ - sums_16[0] = _mm256_setzero_si256(); \ - sums_16[1] = _mm256_setzero_si256(); \ - sums_16[2] = _mm256_setzero_si256(); \ - sums_16[3] = _mm256_setzero_si256(); \ - \ - highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); \ - \ - /* sums_16 will outrange after 8 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32[0] = _mm256_add_epi32( \ - sums_32[0], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[0], 1)))); \ - sums_32[1] = _mm256_add_epi32( \ - sums_32[1], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[1], 1)))); \ - sums_32[2] = _mm256_add_epi32( \ - sums_32[2], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[2], 1)))); \ - sums_32[3] = _mm256_add_epi32( \ - sums_32[3], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[3], 1)))); \ - \ - src += src_stride << 3; \ - } \ - calc_final_4(sums_32, sad_array); \ + highbd_sad32xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ + n); \ } -// 32x64 -HIGHBD_SAD32XNX4D(64) - -// 32x32 -HIGHBD_SAD32XNX4D(32) - -// 32x16 -HIGHBD_SAD32XNX4D(16) +#define HIGHBD_SADSKIP32XNx4D(n) \ + void vpx_highbd_sad_skip_32x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad32xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, n / 2); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/, const uint16_t *src, @@ -275,13 +290,15 @@ static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/, } } -void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], - int ref_stride, uint32_t sad_array[4]) { +static VPX_FORCE_INLINE void highbd_sad16xNx4d_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4], int n) { const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); uint16_t *refs[4]; __m256i sums_16[4]; __m256i sums_32[4]; + const int height = VPXMIN(16, n); + const int num_iters = n / height; int i; refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); @@ -293,13 +310,13 @@ void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride, sums_32[2] = _mm256_setzero_si256(); sums_32[3] = _mm256_setzero_si256(); - for (i = 0; i < 2; ++i) { + for (i = 0; i < num_iters; ++i) { sums_16[0] = _mm256_setzero_si256(); sums_16[1] = _mm256_setzero_si256(); sums_16[2] = _mm256_setzero_si256(); sums_16[3] = _mm256_setzero_si256(); - 
highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16); + highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, height); // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 sums_32[0] = _mm256_add_epi32( @@ -328,6 +345,26 @@ void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums_32, sad_array); } +#define HIGHBD_SAD16XNX4D(n) \ + void vpx_highbd_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad16xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ + n); \ + } + +#define HIGHBD_SADSKIP16XNx4D(n) \ + void vpx_highbd_sad_skip_16x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad16xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, n / 2); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + void vpx_highbd_sad16x16x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]) { @@ -399,3 +436,27 @@ void vpx_highbd_sad16x8x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums_32, sad_array); } } + +// clang-format off +HIGHBD_SAD64XNX4D(64) +HIGHBD_SADSKIP64XNx4D(64) + +HIGHBD_SAD64XNX4D(32) +HIGHBD_SADSKIP64XNx4D(32) + +HIGHBD_SAD32XNX4D(64) +HIGHBD_SADSKIP32XNx4D(64) + +HIGHBD_SAD32XNX4D(32) +HIGHBD_SADSKIP32XNx4D(32) + +HIGHBD_SAD32XNX4D(16) +HIGHBD_SADSKIP32XNx4D(16) + +HIGHBD_SAD16XNX4D(32) +HIGHBD_SADSKIP16XNx4D(32) + +HIGHBD_SADSKIP16XNx4D(16) + +HIGHBD_SADSKIP16XNx4D(8) + // clang-format on diff --git a/vpx_dsp/x86/highbd_sad4d_sse2.asm b/vpx_dsp/x86/highbd_sad4d_sse2.asm index 6c2a61e01..a07892d81 100644 --- a/vpx_dsp/x86/highbd_sad4d_sse2.asm +++ b/vpx_dsp/x86/highbd_sad4d_sse2.asm @@ -213,7 +213,12 @@ SECTION .text ; uint8_t *ref[4], int ref_stride, ; uint32_t res[4]); ; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8 -%macro HIGH_SADNXN4D 2 +; Macro Arguments: +; 1: Width +; 2: Height +; 3: If 0, then normal sad, if 2, then skip every other row +%macro HIGH_SADNXN4D 2-3 0 +%if %3 == 0 ; normal sad %if UNIX64 cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ res, ref2, ref3, ref4 @@ -221,6 +226,15 @@ cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ ref2, ref3, ref4 %endif +%else ; %3 == 2, downsample +%if UNIX64 +cglobal highbd_sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal highbd_sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif ; +%endif ; sad/avg/skip ; set m1 push srcq @@ -229,6 +243,10 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ pshufd m1, m1, 0x0 pop srcq +%if %3 == 2 ; skip rows + lea src_strided, [2*src_strided] + lea ref_strided, [2*ref_strided] +%endif ; skip rows movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided mov ref2q, [ref1q+gprsize*1] @@ -244,9 +262,15 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ shl ref1q, 1 HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 -%rep (%2-4)/2 +%if %3 == 2 ; Downsampling by two +%define num_rep (%2-8)/4 +%else +%define num_rep (%2-4)/2 +%endif +%rep num_rep HIGH_PROCESS_%1x2x4 0, 0, 0, 
src_strideq, ref_strideq, 1 %endrep +%undef rep HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 ; N.B. HIGH_PROCESS outputs dwords (32 bits) ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM @@ -265,6 +289,9 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ paddd m4, m0 paddd m6, m1 punpcklqdq m4, m6 +%if %3 == 2 ; skip rows + pslld m4, 1 +%endif movifnidn r4, r4mp movu [r4], m4 RET @@ -285,3 +312,15 @@ HIGH_SADNXN4D 8, 8 HIGH_SADNXN4D 8, 4 HIGH_SADNXN4D 4, 8 HIGH_SADNXN4D 4, 4 + +HIGH_SADNXN4D 64, 64, 2 +HIGH_SADNXN4D 64, 32, 2 +HIGH_SADNXN4D 32, 64, 2 +HIGH_SADNXN4D 32, 32, 2 +HIGH_SADNXN4D 32, 16, 2 +HIGH_SADNXN4D 16, 32, 2 +HIGH_SADNXN4D 16, 16, 2 +HIGH_SADNXN4D 16, 8, 2 +HIGH_SADNXN4D 8, 16, 2 +HIGH_SADNXN4D 8, 8, 2 +HIGH_SADNXN4D 4, 8, 2 diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c index 231b67f80..78f8eb8bf 100644 --- a/vpx_dsp/x86/highbd_sad_avx2.c +++ b/vpx_dsp/x86/highbd_sad_avx2.c @@ -50,39 +50,49 @@ static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16, } } -#define HIGHBD_SAD64XN(n) \ - unsigned int vpx_highbd_sad64x##n##_avx2( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - __m256i sums_32 = _mm256_setzero_si256(); \ - int i; \ - \ - for (i = 0; i < (n / 2); ++i) { \ - __m256i sums_16 = _mm256_setzero_si256(); \ - \ - highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); \ - \ - /* sums_16 will outrange after 2 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32 = _mm256_add_epi32( \ - sums_32, \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ - _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ - \ - src += src_stride << 1; \ - ref += ref_stride << 1; \ - } \ - return calc_final(sums_32); \ +static VPX_FORCE_INLINE unsigned int highbd_sad64xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + int i; + + for (i = 0; i < (n / 2); ++i) { + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); + + /* sums_16 will outrange after 2 rows, so add current sums_16 to + * sums_32*/ + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 1; + ref += ref_stride << 1; } + return calc_final(sums_32); +} -// 64x64 -HIGHBD_SAD64XN(64) +#define HIGHBD_SAD64XN(n) \ + unsigned int vpx_highbd_sad64x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad64xN_avx2(src, src_stride, ref, ref_stride, n); \ + } -// 64x32 -HIGHBD_SAD64XN(32) +#define HIGHBD_SADSKIP64xN(n) \ + unsigned int vpx_highbd_sad_skip_64x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad64xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, const uint16_t *src, int src_stride, @@ -107,42 +117,49 @@ static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, } } -#define HIGHBD_SAD32XN(n) \ - unsigned int 
vpx_highbd_sad32x##n##_avx2( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - __m256i sums_32 = _mm256_setzero_si256(); \ - int i; \ - \ - for (i = 0; i < (n / 8); ++i) { \ - __m256i sums_16 = _mm256_setzero_si256(); \ - \ - highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); \ - \ - /* sums_16 will outrange after 8 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32 = _mm256_add_epi32( \ - sums_32, \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ - _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ - \ - src += src_stride << 3; \ - ref += ref_stride << 3; \ - } \ - return calc_final(sums_32); \ - } +static VPX_FORCE_INLINE unsigned int highbd_sad32xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + int i; -// 32x64 -HIGHBD_SAD32XN(64) + for (i = 0; i < (n / 8); ++i) { + __m256i sums_16 = _mm256_setzero_si256(); -// 32x32 -HIGHBD_SAD32XN(32) + highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); -// 32x16 -HIGHBD_SAD32XN(16) + /* sums_16 will outrange after 8 rows, so add current sums_16 to + * sums_32*/ + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 3; + ref += ref_stride << 3; + } + return calc_final(sums_32); +} + +#define HIGHBD_SAD32XN(n) \ + unsigned int vpx_highbd_sad32x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad32xN_avx2(src, src_stride, ref, ref_stride, n); \ + } + +#define HIGHBD_SADSKIP32xN(n) \ + unsigned int vpx_highbd_sad_skip_32x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad32xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, const uint16_t *src, int src_stride, @@ -167,17 +184,22 @@ static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, } } -unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride) { +static VPX_FORCE_INLINE unsigned int highbd_sad16xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); __m256i sums_32 = _mm256_setzero_si256(); + const int height = VPXMIN(16, n); + const int num_iters = n / height; int i; - for (i = 0; i < 2; ++i) { + for (i = 0; i < num_iters; ++i) { __m256i sums_16 = _mm256_setzero_si256(); - highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16); + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, height); // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 sums_32 = _mm256_add_epi32( @@ -192,6 +214,21 @@ unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride, return calc_final(sums_32); } +#define HIGHBD_SAD16XN(n) \ + unsigned int vpx_highbd_sad16x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return 
highbd_sad16xN_avx2(src, src_stride, ref, ref_stride, n); \ + } + +#define HIGHBD_SADSKIP16xN(n) \ + unsigned int vpx_highbd_sad_skip_16x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad16xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } + unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); @@ -224,6 +261,23 @@ unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride, } } +// clang-format off +HIGHBD_SAD64XN(64) +HIGHBD_SADSKIP64xN(64) +HIGHBD_SAD64XN(32) +HIGHBD_SADSKIP64xN(32) +HIGHBD_SAD32XN(64) +HIGHBD_SADSKIP32xN(64) +HIGHBD_SAD32XN(32) +HIGHBD_SADSKIP32xN(32) +HIGHBD_SAD32XN(16) +HIGHBD_SADSKIP32xN(16) +HIGHBD_SAD16XN(32) +HIGHBD_SADSKIP16xN(32) +HIGHBD_SADSKIP16xN(16) +HIGHBD_SADSKIP16xN(8) +//clang-format on + // AVG ------------------------------------------------------------------------- static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16, const uint16_t *src, diff --git a/vpx_dsp/x86/highbd_sad_sse2.asm b/vpx_dsp/x86/highbd_sad_sse2.asm index 6a1a6f3d6..62ad2237f 100644 --- a/vpx_dsp/x86/highbd_sad_sse2.asm +++ b/vpx_dsp/x86/highbd_sad_sse2.asm @@ -12,6 +12,11 @@ SECTION .text +; Macro Arguments +; Arg 1: Width +; Arg 2: Height +; Arg 3: Number of general purpose registers +; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows %macro HIGH_SAD_FN 4 %if %4 == 0 %if %3 == 5 @@ -20,7 +25,7 @@ cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 -%else ; avg +%elif %4 == 1 ; avg %if %3 == 5 cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \ second_pred, n_rows @@ -35,7 +40,18 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \ %define n_rowsd dword r0m %endif ; x86-32/64 %endif ; %3 == 5/7 -%endif ; avg/sad +%else ; %4 == 2, skip rows +%if %3 == 5 +cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 +%endif ; sad/avg/skip +%if %4 == 2 ; double the stride if we are skipping rows + lea src_strided, [src_strided*2] + lea ref_strided, [ref_strided*2] +%endif movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided %if %3 == 7 @@ -54,7 +70,11 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \ ; uint8_t *ref, int ref_stride); %macro HIGH_SAD64XN 1-2 0 HIGH_SAD_FN 64, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/2 +%else mov n_rowsd, %1 +%endif pxor m0, m0 pxor m6, m6 @@ -146,6 +166,9 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \ punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -155,13 +178,19 @@ HIGH_SAD64XN 64 ; highbd_sad64x64_sse2 HIGH_SAD64XN 32 ; highbd_sad64x32_sse2 HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2 HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 +HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2 +HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2 ; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride, ; 
uint8_t *ref, int ref_stride); %macro HIGH_SAD32XN 1-2 0 HIGH_SAD_FN 32, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/2 +%else mov n_rowsd, %1 +%endif pxor m0, m0 pxor m6, m6 @@ -213,6 +242,9 @@ HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -224,12 +256,19 @@ HIGH_SAD32XN 16 ; highbd_sad32x16_sse2 HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2 HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2 HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 +HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2 +HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2 +HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2 ; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD16XN 1-2 0 HIGH_SAD_FN 16, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/4 +%else mov n_rowsd, %1/2 +%endif pxor m0, m0 pxor m6, m6 @@ -281,6 +320,9 @@ HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -292,13 +334,19 @@ HIGH_SAD16XN 8 ; highbd_sad16x8_sse2 HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2 HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2 HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 - +HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2 +HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2 +HIGH_SAD16XN 8, 2 ; highbd_sad_skip_16x8_sse2 ; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD8XN 1-2 0 HIGH_SAD_FN 8, %1, 7, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 pxor m6, m6 @@ -350,6 +398,9 @@ HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -361,3 +412,5 @@ HIGH_SAD8XN 4 ; highbd_sad8x4_sse2 HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2 HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2 HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2 +HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2 +HIGH_SAD8XN 8, 2 ; highbd_sad_skip_8x8_sse2 diff --git a/vpx_dsp/x86/inv_txfm_avx2.c b/vpx_dsp/x86/inv_txfm_avx2.c new file mode 100644 index 000000000..752435d24 --- /dev/null +++ b/vpx_dsp/x86/inv_txfm_avx2.c @@ -0,0 +1,626 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <immintrin.h> // AVX2 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" + +#define PAIR256_SET_EPI16(a, b) \ + _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) + +static INLINE void idct_load16x16(const tran_low_t *input, __m256i *in, + int stride) { + int i; + // Load 16x16 values + for (i = 0; i < 16; i++) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i in0 = _mm_loadu_si128((const __m128i *)(input + i * stride)); + const __m128i in1 = + _mm_loadu_si128((const __m128i *)((input + i * stride) + 4)); + const __m128i in2 = + _mm_loadu_si128((const __m128i *)((input + i * stride) + 8)); + const __m128i in3 = + _mm_loadu_si128((const __m128i *)((input + i * stride) + 12)); + const __m128i ls = _mm_packs_epi32(in0, in1); + const __m128i rs = _mm_packs_epi32(in2, in3); + in[i] = _mm256_inserti128_si256(_mm256_castsi128_si256(ls), rs, 1); +#else + in[i] = _mm256_load_si256((const __m256i *)(input + i * stride)); +#endif + } +} + +static INLINE __m256i dct_round_shift_avx2(__m256i in) { + const __m256i t = _mm256_add_epi32(in, _mm256_set1_epi32(DCT_CONST_ROUNDING)); + return _mm256_srai_epi32(t, DCT_CONST_BITS); +} + +static INLINE __m256i idct_madd_round_shift_avx2(__m256i *in, __m256i *cospi) { + const __m256i t = _mm256_madd_epi16(*in, *cospi); + return dct_round_shift_avx2(t); +} + +// Calculate the dot product between in0/1 and x and wrap to short. +static INLINE __m256i idct_calc_wraplow_avx2(__m256i *in0, __m256i *in1, + __m256i *x) { + const __m256i t0 = idct_madd_round_shift_avx2(in0, x); + const __m256i t1 = idct_madd_round_shift_avx2(in1, x); + return _mm256_packs_epi32(t0, t1); +} + +// Multiply elements by constants and add them together. 
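The butterfly16() helper defined next is the usual libvpx rotate-and-round step. Per 16-bit lane it is equivalent to the scalar sketch below; the constants mirror DCT_CONST_BITS / DCT_CONST_ROUNDING from vpx_dsp/txfm_common.h (renamed here to keep the sketch self-contained), and the AVX2 version additionally saturates when packing the 32-bit intermediates back to 16 bits.

#include <stdint.h>

#define SKETCH_DCT_CONST_BITS 14
#define SKETCH_DCT_CONST_ROUNDING (1 << (SKETCH_DCT_CONST_BITS - 1))

// One-lane model of butterfly16(in0, in1, c0, c1, &out0, &out1).
static void butterfly16_scalar(int16_t in0, int16_t in1, int c0, int c1,
                               int16_t *out0, int16_t *out1) {
  const int32_t t0 = in0 * c0 - in1 * c1;  // madd against the (c0, -c1) pair
  const int32_t t1 = in0 * c1 + in1 * c0;  // madd against the (c1, c0) pair
  *out0 = (int16_t)((t0 + SKETCH_DCT_CONST_ROUNDING) >> SKETCH_DCT_CONST_BITS);
  *out1 = (int16_t)((t1 + SKETCH_DCT_CONST_ROUNDING) >> SKETCH_DCT_CONST_BITS);
}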
+static INLINE void butterfly16(__m256i in0, __m256i in1, int c0, int c1, + __m256i *out0, __m256i *out1) { + __m256i cst0 = PAIR256_SET_EPI16(c0, -c1); + __m256i cst1 = PAIR256_SET_EPI16(c1, c0); + __m256i lo = _mm256_unpacklo_epi16(in0, in1); + __m256i hi = _mm256_unpackhi_epi16(in0, in1); + *out0 = idct_calc_wraplow_avx2(&lo, &hi, &cst0); + *out1 = idct_calc_wraplow_avx2(&lo, &hi, &cst1); +} + +static INLINE void idct16_16col(__m256i *in, __m256i *out) { + __m256i step1[16], step2[16]; + + // stage 2 + butterfly16(in[1], in[15], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly16(in[9], in[7], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); + butterfly16(in[5], in[11], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); + butterfly16(in[13], in[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + butterfly16(in[2], in[14], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly16(in[10], in[6], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + step1[8] = _mm256_add_epi16(step2[8], step2[9]); + step1[9] = _mm256_sub_epi16(step2[8], step2[9]); + step1[10] = _mm256_sub_epi16(step2[11], step2[10]); + step1[11] = _mm256_add_epi16(step2[10], step2[11]); + step1[12] = _mm256_add_epi16(step2[12], step2[13]); + step1[13] = _mm256_sub_epi16(step2[12], step2[13]); + step1[14] = _mm256_sub_epi16(step2[15], step2[14]); + step1[15] = _mm256_add_epi16(step2[14], step2[15]); + + // stage 4 + butterfly16(in[0], in[8], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly16(in[4], in[12], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + butterfly16(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly16(step1[10], step1[13], -cospi_8_64, -cospi_24_64, &step2[13], + &step2[10]); + step2[5] = _mm256_sub_epi16(step1[4], step1[5]); + step1[4] = _mm256_add_epi16(step1[4], step1[5]); + step2[6] = _mm256_sub_epi16(step1[7], step1[6]); + step1[7] = _mm256_add_epi16(step1[6], step1[7]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = _mm256_add_epi16(step2[0], step2[3]); + step1[1] = _mm256_add_epi16(step2[1], step2[2]); + step1[2] = _mm256_sub_epi16(step2[1], step2[2]); + step1[3] = _mm256_sub_epi16(step2[0], step2[3]); + butterfly16(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[8] = _mm256_add_epi16(step2[8], step2[11]); + step1[9] = _mm256_add_epi16(step2[9], step2[10]); + step1[10] = _mm256_sub_epi16(step2[9], step2[10]); + step1[11] = _mm256_sub_epi16(step2[8], step2[11]); + step1[12] = _mm256_sub_epi16(step2[15], step2[12]); + step1[13] = _mm256_sub_epi16(step2[14], step2[13]); + step1[14] = _mm256_add_epi16(step2[14], step2[13]); + step1[15] = _mm256_add_epi16(step2[15], step2[12]); + + // stage 6 + step2[0] = _mm256_add_epi16(step1[0], step1[7]); + step2[1] = _mm256_add_epi16(step1[1], step1[6]); + step2[2] = _mm256_add_epi16(step1[2], step1[5]); + step2[3] = _mm256_add_epi16(step1[3], step1[4]); + step2[4] = _mm256_sub_epi16(step1[3], step1[4]); + step2[5] = _mm256_sub_epi16(step1[2], step1[5]); + step2[6] = _mm256_sub_epi16(step1[1], step1[6]); + step2[7] = _mm256_sub_epi16(step1[0], step1[7]); + butterfly16(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10], + &step2[13]); + butterfly16(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11], + &step2[12]); + + // stage 7 + out[0] = _mm256_add_epi16(step2[0], step1[15]); + out[1] = _mm256_add_epi16(step2[1], step1[14]); + out[2] = _mm256_add_epi16(step2[2], 
step2[13]); + out[3] = _mm256_add_epi16(step2[3], step2[12]); + out[4] = _mm256_add_epi16(step2[4], step2[11]); + out[5] = _mm256_add_epi16(step2[5], step2[10]); + out[6] = _mm256_add_epi16(step2[6], step1[9]); + out[7] = _mm256_add_epi16(step2[7], step1[8]); + out[8] = _mm256_sub_epi16(step2[7], step1[8]); + out[9] = _mm256_sub_epi16(step2[6], step1[9]); + out[10] = _mm256_sub_epi16(step2[5], step2[10]); + out[11] = _mm256_sub_epi16(step2[4], step2[11]); + out[12] = _mm256_sub_epi16(step2[3], step2[12]); + out[13] = _mm256_sub_epi16(step2[2], step2[13]); + out[14] = _mm256_sub_epi16(step2[1], step1[14]); + out[15] = _mm256_sub_epi16(step2[0], step1[15]); +} + +static INLINE void recon_and_store16(uint8_t *dest, __m256i in_x) { + const __m256i zero = _mm256_setzero_si256(); + __m256i d0 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dest))); + d0 = _mm256_permute4x64_epi64(d0, 0xd8); + d0 = _mm256_unpacklo_epi8(d0, zero); + d0 = _mm256_add_epi16(in_x, d0); + d0 = _mm256_packus_epi16( + d0, _mm256_castsi128_si256(_mm256_extractf128_si256(d0, 1))); + + _mm_storeu_si128((__m128i *)dest, _mm256_castsi256_si128(d0)); +} + +static INLINE void write_buffer_16x1(uint8_t *dest, __m256i in) { + const __m256i final_rounding = _mm256_set1_epi16(1 << 5); + __m256i out; + out = _mm256_adds_epi16(in, final_rounding); + out = _mm256_srai_epi16(out, 6); + recon_and_store16(dest, out); +} + +static INLINE void store_buffer_16x32(__m256i *in, uint8_t *dst, int stride) { + const __m256i final_rounding = _mm256_set1_epi16(1 << 5); + int j = 0; + while (j < 32) { + in[j] = _mm256_adds_epi16(in[j], final_rounding); + in[j + 1] = _mm256_adds_epi16(in[j + 1], final_rounding); + + in[j] = _mm256_srai_epi16(in[j], 6); + in[j + 1] = _mm256_srai_epi16(in[j + 1], 6); + + recon_and_store16(dst, in[j]); + dst += stride; + recon_and_store16(dst, in[j + 1]); + dst += stride; + j += 2; + } +} + +static INLINE void transpose2_8x8_avx2(__m256i *in, __m256i *out) { + int i; + __m256i t[16], u[16]; + // (1st, 2nd) ==> (lo, hi) + // (0, 1) ==> (0, 1) + // (2, 3) ==> (2, 3) + // (4, 5) ==> (4, 5) + // (6, 7) ==> (6, 7) + for (i = 0; i < 4; i++) { + t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]); + t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 2) ==> (0, 2) + // (1, 3) ==> (1, 3) + // (4, 6) ==> (4, 6) + // (5, 7) ==> (5, 7) + for (i = 0; i < 2; i++) { + u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]); + u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]); + + u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]); + u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 4) ==> (0, 1) + // (1, 5) ==> (4, 5) + // (2, 6) ==> (2, 3) + // (3, 7) ==> (6, 7) + for (i = 0; i < 2; i++) { + out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]); + out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]); + + out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]); + out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]); + } +} + +static INLINE void transpose_16bit_16x16_avx2(__m256i *in, __m256i *out) { + __m256i t[16]; + +#define LOADL(idx) \ + t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \ + t[idx] = _mm256_inserti128_si256( \ + t[idx], _mm_load_si128((__m128i const *)&in[(idx) + 8]), 1); + +#define LOADR(idx) \ + t[8 + (idx)] = \ + _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \ + t[8 + (idx)] = _mm256_inserti128_si256( \ + t[8 + 
(idx)], _mm_load_si128((__m128i const *)&in[(idx) + 8] + 1), 1); + + // load left 8x16 + LOADL(0) + LOADL(1) + LOADL(2) + LOADL(3) + LOADL(4) + LOADL(5) + LOADL(6) + LOADL(7) + + // load right 8x16 + LOADR(0) + LOADR(1) + LOADR(2) + LOADR(3) + LOADR(4) + LOADR(5) + LOADR(6) + LOADR(7) + + // get the top 16x8 result + transpose2_8x8_avx2(t, out); + // get the bottom 16x8 result + transpose2_8x8_avx2(&t[8], &out[8]); +} + +void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + int i; + __m256i in[16]; + + // Load 16x16 values + idct_load16x16(input, in, 16); + + transpose_16bit_16x16_avx2(in, in); + idct16_16col(in, in); + + transpose_16bit_16x16_avx2(in, in); + idct16_16col(in, in); + + for (i = 0; i < 16; ++i) { + write_buffer_16x1(dest + i * stride, in[i]); + } +} + +// Only do addition and subtraction butterfly, size = 16, 32 +static INLINE void add_sub_butterfly_avx2(__m256i *in, __m256i *out, int size) { + int i = 0; + const int num = size >> 1; + const int bound = size - 1; + while (i < num) { + out[i] = _mm256_add_epi16(in[i], in[bound - i]); + out[bound - i] = _mm256_sub_epi16(in[i], in[bound - i]); + i++; + } +} + +// For each 16x32 block __m256i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m256i out[32] +static INLINE void idct32_1024_16x32_quarter_1(__m256i *in, __m256i *out) { + __m256i step1[8], step2[8]; + + // stage 3 + butterfly16(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly16(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + + // stage 4 + butterfly16(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly16(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + step2[4] = _mm256_add_epi16(step1[4], step1[5]); + step2[5] = _mm256_sub_epi16(step1[4], step1[5]); + step2[6] = _mm256_sub_epi16(step1[7], step1[6]); + step2[7] = _mm256_add_epi16(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm256_add_epi16(step2[0], step2[3]); + step1[1] = _mm256_add_epi16(step2[1], step2[2]); + step1[2] = _mm256_sub_epi16(step2[1], step2[2]); + step1[3] = _mm256_sub_epi16(step2[0], step2[3]); + step1[4] = step2[4]; + butterfly16(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm256_add_epi16(step1[0], step1[7]); + out[1] = _mm256_add_epi16(step1[1], step1[6]); + out[2] = _mm256_add_epi16(step1[2], step1[5]); + out[3] = _mm256_add_epi16(step1[3], step1[4]); + out[4] = _mm256_sub_epi16(step1[3], step1[4]); + out[5] = _mm256_sub_epi16(step1[2], step1[5]); + out[6] = _mm256_sub_epi16(step1[1], step1[6]); + out[7] = _mm256_sub_epi16(step1[0], step1[7]); +} + +static INLINE void idct32_16x32_quarter_2_stage_4_to_6(__m256i *step1, + __m256i *out) { + __m256i step2[32]; + + // stage 4 + step2[8] = step1[8]; + step2[15] = step1[15]; + butterfly16(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly16(step1[13], step1[10], -cospi_8_64, cospi_24_64, &step2[10], + &step2[13]); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[8] = _mm256_add_epi16(step2[8], step2[11]); + step1[9] = _mm256_add_epi16(step2[9], step2[10]); + step1[10] = _mm256_sub_epi16(step2[9], step2[10]); + step1[11] = _mm256_sub_epi16(step2[8], step2[11]); + step1[12] = _mm256_sub_epi16(step2[15], step2[12]); + step1[13] = _mm256_sub_epi16(step2[14], step2[13]); + step1[14] = _mm256_add_epi16(step2[14], step2[13]); + step1[15] = _mm256_add_epi16(step2[15], 
step2[12]); + + // stage 6 + out[8] = step1[8]; + out[9] = step1[9]; + butterfly16(step1[13], step1[10], cospi_16_64, cospi_16_64, &out[10], + &out[13]); + butterfly16(step1[12], step1[11], cospi_16_64, cospi_16_64, &out[11], + &out[12]); + out[14] = step1[14]; + out[15] = step1[15]; +} + +// For each 16x32 block __m256i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m256i out[32] +static INLINE void idct32_1024_16x32_quarter_2(__m256i *in, __m256i *out) { + __m256i step1[16], step2[16]; + + // stage 2 + butterfly16(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly16(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); + butterfly16(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); + butterfly16(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + step1[8] = _mm256_add_epi16(step2[8], step2[9]); + step1[9] = _mm256_sub_epi16(step2[8], step2[9]); + step1[10] = _mm256_sub_epi16(step2[11], step2[10]); + step1[11] = _mm256_add_epi16(step2[11], step2[10]); + step1[12] = _mm256_add_epi16(step2[12], step2[13]); + step1[13] = _mm256_sub_epi16(step2[12], step2[13]); + step1[14] = _mm256_sub_epi16(step2[15], step2[14]); + step1[15] = _mm256_add_epi16(step2[15], step2[14]); + + idct32_16x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void idct32_16x32_quarter_3_4_stage_4_to_7(__m256i *step1, + __m256i *out) { + __m256i step2[32]; + + // stage 4 + step2[16] = _mm256_add_epi16(step1[16], step1[19]); + step2[17] = _mm256_add_epi16(step1[17], step1[18]); + step2[18] = _mm256_sub_epi16(step1[17], step1[18]); + step2[19] = _mm256_sub_epi16(step1[16], step1[19]); + step2[20] = _mm256_sub_epi16(step1[23], step1[20]); + step2[21] = _mm256_sub_epi16(step1[22], step1[21]); + step2[22] = _mm256_add_epi16(step1[22], step1[21]); + step2[23] = _mm256_add_epi16(step1[23], step1[20]); + + step2[24] = _mm256_add_epi16(step1[24], step1[27]); + step2[25] = _mm256_add_epi16(step1[25], step1[26]); + step2[26] = _mm256_sub_epi16(step1[25], step1[26]); + step2[27] = _mm256_sub_epi16(step1[24], step1[27]); + step2[28] = _mm256_sub_epi16(step1[31], step1[28]); + step2[29] = _mm256_sub_epi16(step1[30], step1[29]); + step2[30] = _mm256_add_epi16(step1[29], step1[30]); + step2[31] = _mm256_add_epi16(step1[28], step1[31]); + + // stage 5 + step1[16] = step2[16]; + step1[17] = step2[17]; + butterfly16(step2[29], step2[18], cospi_24_64, cospi_8_64, &step1[18], + &step1[29]); + butterfly16(step2[28], step2[19], cospi_24_64, cospi_8_64, &step1[19], + &step1[28]); + butterfly16(step2[27], step2[20], -cospi_8_64, cospi_24_64, &step1[20], + &step1[27]); + butterfly16(step2[26], step2[21], -cospi_8_64, cospi_24_64, &step1[21], + &step1[26]); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + out[16] = _mm256_add_epi16(step1[16], step1[23]); + out[17] = _mm256_add_epi16(step1[17], step1[22]); + out[18] = _mm256_add_epi16(step1[18], step1[21]); + out[19] = _mm256_add_epi16(step1[19], step1[20]); + step2[20] = _mm256_sub_epi16(step1[19], step1[20]); + step2[21] = _mm256_sub_epi16(step1[18], step1[21]); + step2[22] = _mm256_sub_epi16(step1[17], step1[22]); + step2[23] = _mm256_sub_epi16(step1[16], step1[23]); + + step2[24] = _mm256_sub_epi16(step1[31], step1[24]); + step2[25] = _mm256_sub_epi16(step1[30], step1[25]); + step2[26] = _mm256_sub_epi16(step1[29], step1[26]); + step2[27] = 
_mm256_sub_epi16(step1[28], step1[27]); + out[28] = _mm256_add_epi16(step1[27], step1[28]); + out[29] = _mm256_add_epi16(step1[26], step1[29]); + out[30] = _mm256_add_epi16(step1[25], step1[30]); + out[31] = _mm256_add_epi16(step1[24], step1[31]); + + // stage 7 + butterfly16(step2[27], step2[20], cospi_16_64, cospi_16_64, &out[20], + &out[27]); + butterfly16(step2[26], step2[21], cospi_16_64, cospi_16_64, &out[21], + &out[26]); + butterfly16(step2[25], step2[22], cospi_16_64, cospi_16_64, &out[22], + &out[25]); + butterfly16(step2[24], step2[23], cospi_16_64, cospi_16_64, &out[23], + &out[24]); +} + +static INLINE void idct32_1024_16x32_quarter_1_2(__m256i *in, __m256i *out) { + __m256i temp[16]; + + // For each 16x32 block __m256i in[32], + // Input with index, 0, 4, 8, 12, 16, 20, 24, 28 + // output pixels: 0-7 in __m256i out[32] + idct32_1024_16x32_quarter_1(in, temp); + + // Input with index, 2, 6, 10, 14, 18, 22, 26, 30 + // output pixels: 8-15 in __m256i out[32] + idct32_1024_16x32_quarter_2(in, temp); + + // stage 7 + add_sub_butterfly_avx2(temp, out, 16); +} + +// For each 16x32 block __m256i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m256i out[32] +static INLINE void idct32_1024_16x32_quarter_3_4(__m256i *in, __m256i *out) { + __m256i step1[32], step2[32]; + + // stage 1 + butterfly16(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], &step1[31]); + butterfly16(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], &step1[30]); + butterfly16(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], &step1[29]); + butterfly16(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]); + + butterfly16(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], &step1[27]); + butterfly16(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], &step1[26]); + + butterfly16(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], &step1[25]); + butterfly16(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]); + + // stage 2 + step2[16] = _mm256_add_epi16(step1[16], step1[17]); + step2[17] = _mm256_sub_epi16(step1[16], step1[17]); + step2[18] = _mm256_sub_epi16(step1[19], step1[18]); + step2[19] = _mm256_add_epi16(step1[19], step1[18]); + step2[20] = _mm256_add_epi16(step1[20], step1[21]); + step2[21] = _mm256_sub_epi16(step1[20], step1[21]); + step2[22] = _mm256_sub_epi16(step1[23], step1[22]); + step2[23] = _mm256_add_epi16(step1[23], step1[22]); + + step2[24] = _mm256_add_epi16(step1[24], step1[25]); + step2[25] = _mm256_sub_epi16(step1[24], step1[25]); + step2[26] = _mm256_sub_epi16(step1[27], step1[26]); + step2[27] = _mm256_add_epi16(step1[27], step1[26]); + step2[28] = _mm256_add_epi16(step1[28], step1[29]); + step2[29] = _mm256_sub_epi16(step1[28], step1[29]); + step2[30] = _mm256_sub_epi16(step1[31], step1[30]); + step2[31] = _mm256_add_epi16(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + butterfly16(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly16(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + butterfly16(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly16(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + idct32_16x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static 
INLINE void idct32_1024_16x32(__m256i *in, __m256i *out) { + __m256i temp[32]; + + // For each 16x32 block __m256i in[32], + // Input with index, 0, 4, 8, 12, 16, 20, 24, 28 + // output pixels: 0-7 in __m256i out[32] + // AND + // Input with index, 2, 6, 10, 14, 18, 22, 26, 30 + // output pixels: 8-15 in __m256i out[32] + idct32_1024_16x32_quarter_1_2(in, temp); + + // For each 16x32 block __m256i in[32], + // Input with odd index, + // 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + // output pixels: 16-23, 24-31 in __m256i out[32] + idct32_1024_16x32_quarter_3_4(in, temp); + + // final stage + add_sub_butterfly_avx2(temp, out, 32); +} + +void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m256i l[32], r[32], out[32], *in; + int i; + + in = l; + + for (i = 0; i < 2; i++) { + idct_load16x16(input, in, 32); + transpose_16bit_16x16_avx2(in, in); + + idct_load16x16(input + 16, in + 16, 32); + transpose_16bit_16x16_avx2(in + 16, in + 16); + idct32_1024_16x32(in, in); + + in = r; + input += 32 << 4; + } + + for (i = 0; i < 32; i += 16) { + transpose_16bit_16x16_avx2(l + i, out); + transpose_16bit_16x16_avx2(r + i, out + 16); + idct32_1024_16x32(out, out); + + store_buffer_16x32(out, dest, stride); + dest += 16; + } +} + +// Case when only upper-left 16x16 has non-zero coeff +void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m256i in[32], io[32], out[32]; + int i; + + for (i = 16; i < 32; i++) { + in[i] = _mm256_setzero_si256(); + } + + // rows + idct_load16x16(input, in, 32); + transpose_16bit_16x16_avx2(in, in); + idct32_1024_16x32(in, io); + + // columns + for (i = 0; i < 32; i += 16) { + transpose_16bit_16x16_avx2(io + i, in); + idct32_1024_16x32(in, out); + + store_buffer_16x32(out, dest, stride); + dest += 16; + } +} diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index 7d8352721..5ff5abc11 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -19,17 +19,18 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); const __m256i big_zero = _mm256_setzero_si256(); int index; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; @@ -38,12 +39,9 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i all_zero; __m128i eob = zero, eob0; - (void)scan; - *eob_ptr = 0; - load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, - dequant_ptr, &dequant, quant_shift_ptr, &shift); + load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); // Do DC and first 15 AC. 
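The interface change running through all of the quantize_* files is the same: the separate zbin/round/quant/quant_shift pointers and the scan/iscan pointers are replaced by a const struct macroblock_plane * and a const struct ScanOrder *, with the tables read from the plane inside load_b_values(). A hedged caller-side sketch of the new entry point; the wrapper name is illustrative only, and the plane/scan objects are assumed to be set up by the encoder exactly as before:

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_scan.h"
#include "vp9/encoder/vp9_block.h"

// Illustrative wrapper showing how the reworked AVX quantizer is now called.
static void quantize_block_sketch(const tran_low_t *coeff, intptr_t n_coeffs,
                                  const struct macroblock_plane *p,
                                  const struct ScanOrder *so,
                                  const int16_t *dequant, tran_low_t *qcoeff,
                                  tran_low_t *dqcoeff, uint16_t *eob) {
  // The same call previously took p->zbin, p->round, p->quant, p->quant_shift
  // plus separate scan/iscan pointers; those now travel inside `p` and `so`.
  vpx_quantize_b_avx(coeff, n_coeffs, p, qcoeff, dqcoeff, dequant, eob, so);
}

The 32x32 variants additionally drop the now-unused n_coeffs argument.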
coeff0 = load_tran_low(coeff_ptr); @@ -140,17 +138,15 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); const __m256i big_zero = _mm256_setzero_si256(); int index; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; @@ -159,27 +155,8 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i all_zero; __m128i eob = zero, eob0; - (void)scan; - (void)n_coeffs; - - // Setup global values. - // The 32x32 halves zbin and round. - zbin = _mm_load_si128((const __m128i *)zbin_ptr); - // Shift with rounding. - zbin = _mm_add_epi16(zbin, one); - zbin = _mm_srli_epi16(zbin, 1); - // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so - // it is a strict "greater" comparison. - zbin = _mm_sub_epi16(zbin, one); - - round = _mm_load_si128((const __m128i *)round_ptr); - round = _mm_add_epi16(round, one); - round = _mm_srli_epi16(round, 1); - - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - shift = _mm_load_si128((const __m128i *)quant_shift_ptr); - shift = _mm_slli_epi16(shift, 1); + load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, + &shift); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index 28f7c9c7d..d4872f6bc 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -13,13 +13,15 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void load_b_values_avx2( - const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, - __m256i *round, const int16_t *quant_ptr, __m256i *quant, - const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr, + const struct macroblock_plane *mb_plane, __m256i *zbin, __m256i *round, + __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant, __m256i *shift, int log_scale) { - *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr)); + *zbin = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->zbin)); *zbin = _mm256_permute4x64_epi64(*zbin, 0x54); if (log_scale > 0) { const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1))); @@ -30,7 +32,8 @@ static VPX_FORCE_INLINE void load_b_values_avx2( // calculating the zbin mask. 
(See quantize_b_logscale{0,1,2}_16) *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1)); - *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); + *round = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->round)); *round = _mm256_permute4x64_epi64(*round, 0x54); if (log_scale > 0) { const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1))); @@ -38,12 +41,14 @@ static VPX_FORCE_INLINE void load_b_values_avx2( *round = _mm256_srai_epi16(*round, log_scale); } - *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); + *quant = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->quant)); *quant = _mm256_permute4x64_epi64(*quant, 0x54); *dequant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); *dequant = _mm256_permute4x64_epi64(*dequant, 0x54); - *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr)); + *shift = _mm256_castsi128_si256( + _mm_load_si128((const __m128i *)mb_plane->quant_shift)); *shift = _mm256_permute4x64_epi64(*shift, 0x54); } @@ -151,20 +156,17 @@ static VPX_FORCE_INLINE int16_t accumulate_eob256(__m256i eob256) { } void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift, v_nz_mask; __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; - (void)scan; + const int16_t *iscan = scan_order->iscan; - load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, - &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, - &v_quant_shift, 0); + load_b_values_avx2(mb_plane, &v_zbin, &v_round, &v_quant, dequant_ptr, + &v_dequant, &v_quant_shift, 0); // Do DC and first 15 AC. v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, &v_dequant, &v_round, &v_zbin, &v_quant_shift); @@ -250,23 +252,18 @@ static VPX_FORCE_INLINE __m256i quantize_b_32x32_16( } } -void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; - (void)n_coeffs; - (void)scan; + const int16_t *iscan = scan_order->iscan; - load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, - &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, - &v_quant_shift, 1); + load_b_values_avx2(mb_plane, &v_zbin, &v_round, &v_quant, dequant_ptr, + &v_dequant, &v_quant_shift, 1); // Do DC and first 15 AC. 
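For the 32x32 quantizer the per-plane tables are pre-scaled once at load time (log_scale == 1) instead of being adjusted per coefficient. Per element, the zbin/round preparation done by load_b_values_avx2() corresponds to the scalar sketch below; the helper name is illustrative:

#include <stdint.h>

// Scalar model of the zbin/round setup in load_b_values_avx2() when
// log_scale > 0 (log_scale == 1 for 32x32 blocks): halve with rounding,
// then bias zbin so a strict ">" compare behaves like ">=".
static void prepare_zbin_round_sketch(int16_t *zbin, int16_t *round,
                                      int log_scale) {
  const int16_t rnd = (int16_t)(1 << (log_scale - 1));
  *zbin = (int16_t)(((*zbin + rnd) >> log_scale) - 1);
  *round = (int16_t)((*round + rnd) >> log_scale);
}

The same halving used to be open-coded in each 32x32 quantizer (the blocks deleted from quantize_avx.c and quantize_ssse3.c) and now lives in load_b_values32x32() / load_b_values_avx2().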
v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan, diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c index 9533e7916..64838eaa7 100644 --- a/vpx_dsp/x86/quantize_sse2.c +++ b/vpx_dsp/x86/quantize_sse2.c @@ -16,16 +16,16 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" +#include "vp9/common/vp9_scan.h" void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); int index = 16; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; @@ -33,11 +33,8 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i cmp_mask0, cmp_mask1; __m128i eob, eob0; - (void)scan; - // Setup global values. - load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, - dequant_ptr, &dequant, quant_shift_ptr, &shift); + load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h index 27bfb4e41..82c755a0c 100644 --- a/vpx_dsp/x86/quantize_sse2.h +++ b/vpx_dsp/x86/quantize_sse2.h @@ -15,26 +15,53 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_block.h" -static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, - const int16_t *round_ptr, __m128i *round, - const int16_t *quant_ptr, __m128i *quant, +static INLINE void load_b_values(const struct macroblock_plane *const mb_plane, + __m128i *zbin, __m128i *round, __m128i *quant, const int16_t *dequant_ptr, __m128i *dequant, - const int16_t *shift_ptr, __m128i *shift) { - *zbin = _mm_load_si128((const __m128i *)zbin_ptr); - *round = _mm_load_si128((const __m128i *)round_ptr); - *quant = _mm_load_si128((const __m128i *)quant_ptr); + __m128i *shift) { + *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin); + *round = _mm_load_si128((const __m128i *)mb_plane->round); + *quant = _mm_load_si128((const __m128i *)mb_plane->quant); *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1)); *dequant = _mm_load_si128((const __m128i *)dequant_ptr); - *shift = _mm_load_si128((const __m128i *)shift_ptr); + *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift); } -static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round, - const int16_t *quant_ptr, __m128i *quant, +static INLINE void load_b_values32x32( + const struct macroblock_plane *const mb_plane, __m128i *zbin, + __m128i *round, __m128i *quant, const int16_t *dequant_ptr, + __m128i *dequant, __m128i *shift) { + const __m128i one = _mm_set1_epi16(1); + // The 32x32 halves zbin and round. + *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin); + // Shift with rounding. + *zbin = _mm_add_epi16(*zbin, one); + *zbin = _mm_srli_epi16(*zbin, 1); + // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so + // it is a strict "greater" comparison. 
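  // Reference note (editorial sketch, not part of this change): for the
  // non-negative 16-bit magnitudes compared against zbin the identity
  //   abs_coeff >= zbin   <=>   abs_coeff > (zbin - 1)
  // holds, so pre-decrementing zbin lets _mm_cmpgt_epi16() replace the
  // missing "greater or equal" comparison, e.g.
  //   keep = abs_coeff > (int16_t)(zbin - 1);  /* same as abs_coeff >= zbin */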
+ *zbin = _mm_sub_epi16(*zbin, one); + + *round = _mm_load_si128((const __m128i *)mb_plane->round); + *round = _mm_add_epi16(*round, one); + *round = _mm_srli_epi16(*round, 1); + + *quant = _mm_load_si128((const __m128i *)mb_plane->quant); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); + *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift); + // I suspect this is not technically OK because quant_shift can be up + // to 1 << 16 and shifting up again will outrange that, but the test is not + // comprehensive enough to catch that and "it's been that way forever" + *shift = _mm_slli_epi16(*shift, 1); +} + +static INLINE void load_fp_values(const struct macroblock_plane *mb_plane, + __m128i *round, __m128i *quant, const int16_t *dequant_ptr, __m128i *dequant) { - *round = _mm_load_si128((const __m128i *)round_ptr); - *quant = _mm_load_si128((const __m128i *)quant_ptr); + *round = _mm_load_si128((const __m128i *)mb_plane->round_fp); + *quant = _mm_load_si128((const __m128i *)mb_plane->quant_fp); *dequant = _mm_load_si128((const __m128i *)dequant_ptr); } diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 476230286..2c6d851a1 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -16,16 +16,17 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); int index = 16; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; @@ -33,10 +34,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i cmp_mask0, cmp_mask1; __m128i eob, eob0; - (void)scan; - - load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, - dequant_ptr, &dequant, quant_shift_ptr, &shift); + load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); @@ -107,17 +105,14 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); int index; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; @@ -126,30 +121,8 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i all_zero; __m128i eob = zero, eob0; - (void)scan; - (void)n_coeffs; - - // Setup global values. 
- // The 32x32 halves zbin and round. - zbin = _mm_load_si128((const __m128i *)zbin_ptr); - // Shift with rounding. - zbin = _mm_add_epi16(zbin, one); - zbin = _mm_srli_epi16(zbin, 1); - // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so - // it is a strict "greater" comparison. - zbin = _mm_sub_epi16(zbin, one); - - round = _mm_load_si128((const __m128i *)round_ptr); - round = _mm_add_epi16(round, one); - round = _mm_srli_epi16(round, 1); - - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - shift = _mm_load_si128((const __m128i *)quant_shift_ptr); - // I suspect this is not technically OK because quant_shift can be up - // to 1 << 16 and shifting up again will outrange that, but the test is not - // comprehensive enough to catch that and "it's been that way forever" - shift = _mm_slli_epi16(shift, 1); + load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, + &shift); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c index 399b67b3f..cf7111983 100644 --- a/vpx_dsp/x86/sad4d_avx2.c +++ b/vpx_dsp/x86/sad4d_avx2.c @@ -25,9 +25,10 @@ static INLINE void calc_final_4(const __m256i *const sums /*[4]*/, _mm_storeu_si128((__m128i *)sad_array, sum); } -void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { +static INLINE void sad32xhx4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, int h, + uint32_t sad_array[4]) { int i; const uint8_t *refs[4]; __m256i sums[4]; @@ -41,7 +42,7 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, sums[2] = _mm256_setzero_si256(); sums[3] = _mm256_setzero_si256(); - for (i = 0; i < 32; i++) { + for (i = 0; i < h; i++) { __m256i r[4]; // load src and all ref[] @@ -73,9 +74,10 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums, sad_array); } -void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { +static INLINE void sad64xhx4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, int h, + uint32_t sad_array[4]) { __m256i sums[4]; int i; const uint8_t *refs[4]; @@ -89,7 +91,7 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, sums[2] = _mm256_setzero_si256(); sums[3] = _mm256_setzero_si256(); - for (i = 0; i < 64; i++) { + for (i = 0; i < h; i++) { __m256i r_lo[4], r_hi[4]; // load 64 bytes from src and all ref[] const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr); @@ -132,3 +134,51 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums, sad_array); } + +#define SAD64_H(h) \ + void vpx_sad64x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad64xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \ + } + +#define SAD32_H(h) \ + void vpx_sad32x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad32xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \ + } + +SAD64_H(64) +SAD32_H(32) + +#define SADS64_H(h) \ + void vpx_sad_skip_64x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const 
ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad64xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + ((h) >> 1), sad_array); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +#define SADS32_H(h) \ + void vpx_sad_skip_32x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad32xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + ((h) >> 1), sad_array); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +SADS64_H(64) +SADS64_H(32) + +SADS32_H(64) +SADS32_H(32) +SADS32_H(16) diff --git a/vpx_dsp/x86/sad4d_sse2.asm b/vpx_dsp/x86/sad4d_sse2.asm index 3f6e55ce9..ed4ea3ef9 100644 --- a/vpx_dsp/x86/sad4d_sse2.asm +++ b/vpx_dsp/x86/sad4d_sse2.asm @@ -179,7 +179,16 @@ SECTION .text ; uint8_t *ref[4], int ref_stride, ; uint32_t res[4]); ; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4 -%macro SADNXN4D 2 +%macro SADNXN4D 2-3 0 +%if %3 == 1 ; skip rows +%if UNIX64 +cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif +%else ; normal sad %if UNIX64 cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ res, ref2, ref3, ref4 @@ -187,6 +196,11 @@ cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ ref2, ref3, ref4 %endif +%endif +%if %3 == 1 + lea src_strided, [2*src_strided] + lea ref_strided, [2*ref_strided] +%endif movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided mov ref2q, [ref1q+gprsize*1] @@ -195,9 +209,15 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ mov ref1q, [ref1q+gprsize*0] PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 -%rep (%2-4)/2 +%if %3 == 1 ; downsample number of rows by 2 +%define num_rep (%2-8)/4 +%else +%define num_rep (%2-4)/2 +%endif +%rep num_rep PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 %endrep +%undef num_rep PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 %if %1 > 4 @@ -211,12 +231,19 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ punpckhqdq m5, m7 movifnidn r4, r4mp paddd m4, m5 +%if %3 == 1 + pslld m4, 1 +%endif movu [r4], m4 RET %else movifnidn r4, r4mp pshufd m6, m6, 0x08 pshufd m7, m7, 0x08 +%if %3 == 1 + pslld m6, 1 + pslld m7, 1 +%endif movq [r4+0], m6 movq [r4+8], m7 RET @@ -237,3 +264,15 @@ SADNXN4D 8, 8 SADNXN4D 8, 4 SADNXN4D 4, 8 SADNXN4D 4, 4 + +SADNXN4D 64, 64, 1 +SADNXN4D 64, 32, 1 +SADNXN4D 32, 64, 1 +SADNXN4D 32, 32, 1 +SADNXN4D 32, 16, 1 +SADNXN4D 16, 32, 1 +SADNXN4D 16, 16, 1 +SADNXN4D 16, 8, 1 +SADNXN4D 8, 16, 1 +SADNXN4D 8, 8, 1 +SADNXN4D 4, 8, 1 diff --git a/vpx_dsp/x86/sad_avx2.c b/vpx_dsp/x86/sad_avx2.c index 29bedb0e6..e00494d76 100644 --- a/vpx_dsp/x86/sad_avx2.c +++ b/vpx_dsp/x86/sad_avx2.c @@ -11,73 +11,104 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" +static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + int i, res; + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; + __m256i sum_sad = _mm256_setzero_si256(); + __m256i sum_sad_h; + __m128i sum_sad128; + for (i = 0; i < h; i++) { + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); + ref2_reg = _mm256_loadu_si256((__m256i 
const *)(ref_ptr + 32)); + sad1_reg = + _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); + sad2_reg = _mm256_sad_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); + sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); + ref_ptr += ref_stride; + src_ptr += src_stride; + } + sum_sad_h = _mm256_srli_si256(sum_sad, 8); + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); + res = _mm_cvtsi128_si32(sum_sad128); + return res; +} + +static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + int i, res; + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; + __m256i sum_sad = _mm256_setzero_si256(); + __m256i sum_sad_h; + __m128i sum_sad128; + const int ref2_stride = ref_stride << 1; + const int src2_stride = src_stride << 1; + const int max = h >> 1; + for (i = 0; i < max; i++) { + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); + sad1_reg = + _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); + sad2_reg = _mm256_sad_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); + sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); + ref_ptr += ref2_stride; + src_ptr += src2_stride; + } + sum_sad_h = _mm256_srli_si256(sum_sad, 8); + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); + res = _mm_cvtsi128_si32(sum_sad128); + return res; +} + #define FSAD64_H(h) \ unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ - int i; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - for (i = 0; i < h; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ - sad1_reg = _mm256_sad_epu8( \ - ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8( \ - ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ - sum_sad = \ - _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr += ref_stride; \ - src_ptr += src_stride; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ + return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \ + } + +#define FSADS64_H(h) \ + unsigned int vpx_sad_skip_64x##h##_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \ + h / 2); \ } #define FSAD32_H(h) \ unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ - int i, res; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - int ref2_stride = ref_stride << 1; \ - int src2_stride 
= src_stride << 1; \ - int max = h >> 1; \ - for (i = 0; i < max; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ - sad1_reg = _mm256_sad_epu8( \ - ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8( \ - ref2_reg, \ - _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ - sum_sad = \ - _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr += ref2_stride; \ - src_ptr += src2_stride; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ + return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \ + } + +#define FSADS32_H(h) \ + unsigned int vpx_sad_skip_32x##h##_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \ + h / 2); \ } -#define FSAD64 \ - FSAD64_H(64) \ - FSAD64_H(32) +#define FSAD64 \ + FSAD64_H(64) \ + FSAD64_H(32) \ + FSADS64_H(64) \ + FSADS64_H(32) -#define FSAD32 \ - FSAD32_H(64) \ - FSAD32_H(32) \ - FSAD32_H(16) +#define FSAD32 \ + FSAD32_H(64) \ + FSAD32_H(32) \ + FSAD32_H(16) \ + FSADS32_H(64) \ + FSADS32_H(32) \ + FSADS32_H(16) FSAD64 FSAD32 @@ -86,6 +117,8 @@ FSAD32 #undef FSAD32 #undef FSAD64_H #undef FSAD32_H +#undef FSADS64_H +#undef FSADS32_H #define FSADAVG64_H(h) \ unsigned int vpx_sad64x##h##_avg_avx2( \ diff --git a/vpx_dsp/x86/sad_sse2.asm b/vpx_dsp/x86/sad_sse2.asm index e4e1bc3e9..627e463bf 100644 --- a/vpx_dsp/x86/sad_sse2.asm +++ b/vpx_dsp/x86/sad_sse2.asm @@ -12,15 +12,29 @@ SECTION .text +; Macro Arguments +; Arg 1: Width +; Arg 2: Height +; Arg 3: Number of general purpose registers +; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows %macro SAD_FN 4 -%if %4 == 0 +%if %4 == 0 ; normal sad %if %3 == 5 cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows %else ; %3 == 7 cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 -%else ; avg + +%elif %4 == 2 ; skip +%if %3 == 5 +cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 + +%else %if %3 == 5 cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ second_pred, n_rows @@ -35,7 +49,11 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \ %define n_rowsd dword r0m %endif ; x86-32/64 %endif ; %3 == 5/7 -%endif ; avg/sad +%endif ; sad/avg/skip +%if %4 == 2; skip rows so double the stride +lea src_strided, [src_strided*2] +lea ref_strided, [ref_strided*2] +%endif ; %4 skip movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided %if %3 == 7 @@ -48,7 +66,11 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \ ; uint8_t *ref, int ref_stride); %macro SAD64XN 1-2 0 SAD_FN 64, %1, 5, %2 +%if %2 == 2 + mov n_rowsd, %1/2 +%else mov n_rowsd, %1 +%endif pxor m0, m0 .loop: movu m1, [refq] @@ -77,6 +99,9 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \ movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the 
sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -86,12 +111,18 @@ SAD64XN 64 ; sad64x64_sse2 SAD64XN 32 ; sad64x32_sse2 SAD64XN 64, 1 ; sad64x64_avg_sse2 SAD64XN 32, 1 ; sad64x32_avg_sse2 +SAD64XN 64, 2 ; sad64x64_skip_sse2 +SAD64XN 32, 2 ; sad64x32_skip_sse2 ; unsigned int vpx_sad32x32_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD32XN 1-2 0 SAD_FN 32, %1, 5, %2 +%if %2 == 2 + mov n_rowsd, %1/4 +%else mov n_rowsd, %1/2 +%endif pxor m0, m0 .loop: movu m1, [refq] @@ -120,6 +151,9 @@ SAD64XN 32, 1 ; sad64x32_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -131,12 +165,19 @@ SAD32XN 16 ; sad32x16_sse2 SAD32XN 64, 1 ; sad32x64_avg_sse2 SAD32XN 32, 1 ; sad32x32_avg_sse2 SAD32XN 16, 1 ; sad32x16_avg_sse2 +SAD32XN 64, 2 ; sad32x64_skip_sse2 +SAD32XN 32, 2 ; sad32x32_skip_sse2 +SAD32XN 16, 2 ; sad32x16_skip_sse2 ; unsigned int vpx_sad16x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD16XN 1-2 0 SAD_FN 16, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 .loop: @@ -166,6 +207,9 @@ SAD32XN 16, 1 ; sad32x16_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -177,12 +221,19 @@ SAD16XN 8 ; sad16x8_sse2 SAD16XN 32, 1 ; sad16x32_avg_sse2 SAD16XN 16, 1 ; sad16x16_avg_sse2 SAD16XN 8, 1 ; sad16x8_avg_sse2 +SAD16XN 32, 2 ; sad16x32_skip_sse2 +SAD16XN 16, 2 ; sad16x16_skip_sse2 +SAD16XN 8, 2 ; sad16x8_skip_sse2 ; unsigned int vpx_sad8x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD8XN 1-2 0 SAD_FN 8, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 .loop: @@ -210,6 +261,9 @@ SAD16XN 8, 1 ; sad16x8_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -221,12 +275,18 @@ SAD8XN 4 ; sad8x4_sse2 SAD8XN 16, 1 ; sad8x16_avg_sse2 SAD8XN 8, 1 ; sad8x8_avg_sse2 SAD8XN 4, 1 ; sad8x4_avg_sse2 +SAD8XN 16, 2 ; sad8x16_skip_sse2 +SAD8XN 8, 2 ; sad8x8_skip_sse2 ; unsigned int vpx_sad4x{4, 8}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD4XN 1-2 0 SAD_FN 4, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 .loop: @@ -257,6 +317,9 @@ SAD8XN 4, 1 ; sad8x4_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -266,3 +329,4 @@ SAD4XN 8 ; sad4x8_sse SAD4XN 4 ; sad4x4_sse SAD4XN 8, 1 ; sad4x8_avg_sse SAD4XN 4, 1 ; sad4x4_avg_sse +SAD4XN 8, 2 ; sad4x8_skip_sse diff --git a/vpx_dsp/x86/sse_avx2.c b/vpx_dsp/x86/sse_avx2.c new file mode 100644 index 000000000..917ff0ef1 --- /dev/null +++ b/vpx_dsp/x86/sse_avx2.c @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <smmintrin.h> +#include <immintrin.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_ports/mem.h" +#include "vpx_dsp/x86/mem_sse2.h" + +static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a, + const uint8_t *b) { + const __m256i v_a0 = _mm256_loadu_si256((const __m256i *)a); + const __m256i v_b0 = _mm256_loadu_si256((const __m256i *)b); + const __m256i zero = _mm256_setzero_si256(); + const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, zero); + const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero); + const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero); + const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero); + const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w); + const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w)); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w)); +} + +static INLINE int64_t summary_all_avx2(const __m256i *sum_all) { + int64_t sum; + __m256i zero = _mm256_setzero_si256(); + const __m256i sum0_4x64 = _mm256_unpacklo_epi32(*sum_all, zero); + const __m256i sum1_4x64 = _mm256_unpackhi_epi32(*sum_all, zero); + const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64); + const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), + _mm256_extracti128_si256(sum_4x64, 1)); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + _mm_storel_epi64((__m128i *)&sum, sum_1x64); + return sum; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void summary_32_avx2(const __m256i *sum32, __m256i *sum) { + const __m256i sum0_4x64 = + _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum32)); + const __m256i sum1_4x64 = + _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum32, 1)); + const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64); + *sum = _mm256_add_epi64(*sum, sum_4x64); +} + +static INLINE int64_t summary_4x64_avx2(const __m256i sum_4x64) { + int64_t sum; + const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), + _mm256_extracti128_si256(sum_4x64, 1)); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + + _mm_storel_epi64((__m128i *)&sum, sum_1x64); + return sum; +} +#endif + +static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m256i *sum) { + const __m128i v_a0 = load_unaligned_u32(a); + const __m128i v_a1 = load_unaligned_u32(a + a_stride); + const __m128i v_a2 = load_unaligned_u32(a + a_stride * 2); + const __m128i v_a3 = load_unaligned_u32(a + a_stride * 3); + const __m128i v_b0 = load_unaligned_u32(b); + const __m128i v_b1 = load_unaligned_u32(b + b_stride); + const __m128i v_b2 = load_unaligned_u32(b + b_stride * 2); + const __m128i v_b3 = load_unaligned_u32(b + b_stride * 3); + const __m128i v_a0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_a0, v_a1), + _mm_unpacklo_epi32(v_a2, v_a3)); + const __m128i v_b0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_b0, v_b1), + _mm_unpacklo_epi32(v_b2, v_b3)); + const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123); + const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m256i *sum) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + 
a_stride)); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride)); + const __m256i v_a_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1)); + const __m256i v_b_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1)); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +int64_t vpx_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + __m256i sum = _mm256_setzero_si256(); + __m256i zero = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + sse_w4x4_avx2(a, a_stride, b, b_stride, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + sse_w8x2_avx2(a, a_stride, b, b_stride, &sum); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + const __m128i v_a0 = _mm_loadu_si128((const __m128i *)a); + const __m128i v_a1 = _mm_loadu_si128((const __m128i *)(a + a_stride)); + const __m128i v_b0 = _mm_loadu_si128((const __m128i *)b); + const __m128i v_b1 = _mm_loadu_si128((const __m128i *)(b + b_stride)); + const __m256i v_a = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01); + const __m256i v_b = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01); + const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero); + const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero); + const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero); + const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero); + const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl); + const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu); + const __m256i temp = + _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub), + _mm256_madd_epi16(v_bsub, v_bsub)); + sum = _mm256_add_epi32(sum, temp); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 32: + do { + sse_w32_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 64: + do { + sse_w32_avx2(&sum, a, b); + sse_w32_avx2(&sum, a + 32, b + 32); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + default: + if ((width & 0x07) == 0) { + do { + int i = 0; + do { + sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); + i += 8; + } while (i < width); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + } else { + do { + int i = 0; + do { + const uint8_t *a2; + const uint8_t *b2; + sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); + a2 = a + i + (a_stride << 1); + b2 = b + i + (b_stride << 1); + sse_w8x2_avx2(a2, a_stride, b2, b_stride, &sum); + i += 8; + } while (i + 4 < width); + sse_w4x4_avx2(a + i, a_stride, b + i, b_stride, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + } + sse = summary_all_avx2(&sum); + break; + } + + return sse; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a, + const uint16_t *b) { + const __m256i v_a_w = _mm256_loadu_si256((const __m256i *)a); + const __m256i v_b_w = _mm256_loadu_si256((const __m256i *)b); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, 
_mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w4x4_avx2(__m256i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride)); + const __m128i v_a2 = _mm_loadl_epi64((const __m128i *)(a + a_stride * 2)); + const __m128i v_a3 = _mm_loadl_epi64((const __m128i *)(a + a_stride * 3)); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride)); + const __m128i v_b2 = _mm_loadl_epi64((const __m128i *)(b + b_stride * 2)); + const __m128i v_b3 = _mm_loadl_epi64((const __m128i *)(b + b_stride * 3)); + const __m128i v_a_hi = _mm_unpacklo_epi64(v_a0, v_a1); + const __m128i v_a_lo = _mm_unpacklo_epi64(v_a2, v_a3); + const __m256i v_a_w = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_a_lo), v_a_hi, 1); + const __m128i v_b_hi = _mm_unpacklo_epi64(v_b0, v_b1); + const __m128i v_b_lo = _mm_unpacklo_epi64(v_b2, v_b3); + const __m256i v_b_w = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_b_lo), v_b_hi, 1); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w8x2_avx2(__m256i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a_hi = _mm_loadu_si128((const __m128i *)(a + a_stride)); + const __m128i v_a_lo = _mm_loadu_si128((const __m128i *)a); + const __m256i v_a_w = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_a_lo), v_a_hi, 1); + const __m128i v_b_hi = _mm_loadu_si128((const __m128i *)(b + b_stride)); + const __m128i v_b_lo = _mm_loadu_si128((const __m128i *)b); + const __m256i v_b_w = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_b_lo), v_b_hi, 1); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +int64_t vpx_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m256i sum = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + highbd_sse_w4x4_avx2(&sum, a, a_stride, b, b_stride); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + highbd_sse_w8x2_avx2(&sum, a, a_stride, b, b_stride); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + highbd_sse_w16_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 32: + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16, b + 16); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 64 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 64; + } while (y < height); + sse = summary_4x64_avx2(sum); + break; + case 64: + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1); + highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2); + highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 32 && l < 
(height - y)); + summary_32_avx2(&sum32, &sum); + y += 32; + } while (y < height); + sse = summary_4x64_avx2(sum); + break; + default: + if (width & 0x7) { + do { + int i = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + const uint16_t *a2; + const uint16_t *b2; + highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); + a2 = a + i + (a_stride << 1); + b2 = b + i + (b_stride << 1); + highbd_sse_w8x2_avx2(&sum32, a2, a_stride, b2, b_stride); + i += 8; + } while (i + 4 < width); + highbd_sse_w4x4_avx2(&sum32, a + i, a_stride, b + i, b_stride); + summary_32_avx2(&sum32, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + } else { + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + int i = 0; + do { + highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); + i += 8; + } while (i < width); + a += a_stride << 1; + b += b_stride << 1; + l += 2; + } while (l < 8 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 8; + } while (y < height); + } + sse = summary_4x64_avx2(sum); + break; + } + return sse; +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/sse_sse4.c b/vpx_dsp/x86/sse_sse4.c new file mode 100644 index 000000000..4a7585c57 --- /dev/null +++ b/vpx_dsp/x86/sse_sse4.c @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <smmintrin.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/mem_sse2.h" + +static INLINE int64_t summary_all_sse4(const __m128i *sum_all) { + int64_t sum; + const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all); + const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8)); + const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + _mm_storel_epi64((__m128i *)&sum, sum_1x64); + return sum; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void summary_32_sse4(const __m128i *sum32, __m128i *sum64) { + const __m128i sum0 = _mm_cvtepu32_epi64(*sum32); + const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum32, 8)); + *sum64 = _mm_add_epi64(sum0, *sum64); + *sum64 = _mm_add_epi64(sum1, *sum64); +} +#endif + +static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a, + const uint8_t *b) { + const __m128i v_a0 = _mm_loadu_si128((const __m128i *)a); + const __m128i v_b0 = _mm_loadu_si128((const __m128i *)b); + const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0); + const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8)); + const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0); + const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8)); + const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w); + const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w)); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w)); +} + +static INLINE void sse4x2_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m128i *sum) { + const __m128i v_a0 = load_unaligned_u32(a); + const __m128i v_a1 = load_unaligned_u32(a + 
a_stride); + const __m128i v_b0 = load_unaligned_u32(b); + const __m128i v_b1 = load_unaligned_u32(b + b_stride); + const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1)); + const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1)); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void sse8_sse4_1(const uint8_t *a, const uint8_t *b, + __m128i *sum) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0); + const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +int64_t vpx_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int y = 0; + int64_t sse = 0; + __m128i sum = _mm_setzero_si128(); + switch (width) { + case 4: + do { + sse4x2_sse4_1(a, a_stride, b, b_stride, &sum); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 8: + do { + sse8_sse4_1(a, b, &sum); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 16: + do { + sse_w16_sse4_1(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 32: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16, b + 16); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 64: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); + sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); + sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + default: + if (width & 0x07) { + do { + int i = 0; + do { + sse8_sse4_1(a + i, b + i, &sum); + sse8_sse4_1(a + i + a_stride, b + i + b_stride, &sum); + i += 8; + } while (i + 4 < width); + sse4x2_sse4_1(a + i, a_stride, b + i, b_stride, &sum); + a += (a_stride << 1); + b += (b_stride << 1); + y += 2; + } while (y < height); + } else { + do { + int i = 0; + do { + sse8_sse4_1(a + i, b + i, &sum); + i += 8; + } while (i < width); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + } + sse = summary_all_sse4(&sum); + break; + } + + return sse; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride)); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride)); + const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1); + const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a, + const uint16_t *b) { + const __m128i v_a_w = _mm_loadu_si128((const __m128i *)a); + const __m128i v_b_w = _mm_loadu_si128((const __m128i *)b); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, 
v_d_w)); +} + +int64_t vpx_highbd_sse_sse4_1(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int width, + int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m128i sum = _mm_setzero_si128(); + switch (width) { + case 4: + do { + highbd_sse_w4x2_sse4_1(&sum, a, a_stride, b, b_stride); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 8: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 16: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8, b + 8); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 64 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 64; + } while (y < height); + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + case 32: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 32 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 32; + } while (y < height); + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + case 64: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 16 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 16; + } while (y < height); + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + default: + if (width & 0x7) { + do { + __m128i sum32 = _mm_setzero_si128(); + int i = 0; + do { + highbd_sse_w8_sse4_1(&sum32, a + i, b + i); + highbd_sse_w8_sse4_1(&sum32, a + i + a_stride, b + i + b_stride); + i += 8; + } while (i + 4 < width); + highbd_sse_w4x2_sse4_1(&sum32, a + i, a_stride, b + i, b_stride); + a += (a_stride << 1); + b += (b_stride << 1); + y += 2; + summary_32_sse4(&sum32, &sum); + } while (y < height); + } else { + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + int i = 0; + do { + highbd_sse_w8_sse4_1(&sum32, a + i, b + i); + i += 8; + } while (i < width); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 8 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 8; + } while (y < height); + } + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + } + return sse; +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/subtract_sse2.asm b/vpx_dsp/x86/subtract_sse2.asm index 4273efb85..e3055ab29 100644 --- a/vpx_dsp/x86/subtract_sse2.asm +++ b/vpx_dsp/x86/subtract_sse2.asm @@ -124,4 +124,5 @@ INIT_MMX lea predq, [predq+pred_str*2] sub rowsd, 2 jg .loop_4 + emms RET diff --git 
a/vpx_dsp/x86/variance_avx2.c b/vpx_dsp/x86/variance_avx2.c index 35925d590..8305b9f20 100644 --- a/vpx_dsp/x86/variance_avx2.c +++ b/vpx_dsp/x86/variance_avx2.c @@ -98,6 +98,41 @@ static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) { return _mm256_add_epi32(sum_lo, sum_hi); } +static INLINE void variance8_kernel_avx2( + const uint8_t *const src, const int src_stride, const uint8_t *const ref, + const int ref_stride, __m256i *const sse, __m256i *const sum) { + __m128i src0, src1, ref0, ref1; + __m256i ss, rr, diff; + + // 0 0 0.... 0 s07 s06 s05 s04 s03 s02 s01 s00 + src0 = _mm_loadl_epi64((const __m128i *)(src + 0 * src_stride)); + + // 0 0 0.... 0 s17 s16 s15 s14 s13 s12 s11 s10 + src1 = _mm_loadl_epi64((const __m128i *)(src + 1 * src_stride)); + + // s17 s16...s11 s10 s07 s06...s01 s00 (8bit) + src0 = _mm_unpacklo_epi64(src0, src1); + + // s17 s16...s11 s10 s07 s06...s01 s00 (16 bit) + ss = _mm256_cvtepu8_epi16(src0); + + // 0 0 0.... 0 r07 r06 r05 r04 r03 r02 r01 r00 + ref0 = _mm_loadl_epi64((const __m128i *)(ref + 0 * ref_stride)); + + // 0 0 0.... 0 r17 r16 0 r15 0 r14 0 r13 0 r12 0 r11 0 r10 + ref1 = _mm_loadl_epi64((const __m128i *)(ref + 1 * ref_stride)); + + // r17 r16...r11 r10 r07 r06...r01 r00 (8 bit) + ref0 = _mm_unpacklo_epi64(ref0, ref1); + + // r17 r16...r11 r10 r07 r06...r01 r00 (16 bit) + rr = _mm256_cvtepu8_epi16(ref0); + + diff = _mm256_sub_epi16(ss, rr); + *sse = _mm256_add_epi32(*sse, _mm256_madd_epi16(diff, diff)); + *sum = _mm256_add_epi16(*sum, diff); +} + static INLINE void variance16_kernel_avx2( const uint8_t *const src, const int src_stride, const uint8_t *const ref, const int ref_stride, __m256i *const sse, __m256i *const sum) { @@ -119,6 +154,21 @@ static INLINE void variance32_kernel_avx2(const uint8_t *const src, variance_kernel_avx2(s, r, sse, sum); } +static INLINE void variance8_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + int i; + *vsum = _mm256_setzero_si256(); + *vsse = _mm256_setzero_si256(); + + for (i = 0; i < h; i += 2) { + variance8_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum); + src += 2 * src_stride; + ref += 2 * ref_stride; + } +} + static INLINE void variance16_avx2(const uint8_t *src, const int src_stride, const uint8_t *ref, const int ref_stride, const int h, __m256i *const vsse, @@ -612,6 +662,36 @@ typedef void (*get_var_avx2)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse, vsum; + int sum; + variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - ((sum * sum) >> 5); +} + +unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse, vsum; + int sum; + variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - ((sum * sum) >> 6); +} + +unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse, vsum; + int sum; + variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + 
variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - ((sum * sum) >> 7); +} + unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index c7d880860..526c28382 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -15,6 +15,7 @@ #include "vpx_dsp/x86/convolve.h" #include "vpx_dsp/x86/convolve_avx2.h" #include "vpx_dsp/x86/convolve_sse2.h" +#include "vpx_dsp/x86/convolve_ssse3.h" #include "vpx_ports/mem.h" // filters for 16_h8 @@ -38,6 +39,27 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 }; +DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[64]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, + 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, + 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, +}; + +#define CALC_CONVOLVE8_HORZ_ROW \ + srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch); \ + s1[0] = _mm256_shuffle_epi8(srcReg, filt[0]); \ + s1[1] = _mm256_shuffle_epi8(srcReg, filt[1]); \ + s1[2] = _mm256_shuffle_epi8(srcReg, filt[2]); \ + s1[3] = _mm256_shuffle_epi8(srcReg, filt[3]); \ + s1[0] = convolve8_16_avx2(s1, f1); \ + s1[0] = _mm256_packus_epi16(s1[0], s1[0]); \ + src_ptr += src_stride; \ + _mm_storel_epi64((__m128i *)&output_ptr[0], _mm256_castsi256_si128(s1[0])); \ + output_ptr += output_pitch; \ + _mm_storel_epi64((__m128i *)&output_ptr[0], \ + _mm256_extractf128_si256(s1[0], 1)); \ + output_ptr += output_pitch; + static INLINE void vpx_filter_block1d16_h8_x_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter, @@ -61,12 +83,7 @@ static INLINE void vpx_filter_block1d16_h8_x_avx2( __m256i srcReg; // load the 2 strides of source - srcReg = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3))); - srcReg = _mm256_inserti128_si256( - srcReg, - _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)), - 1); + srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr + src_pixels_per_line - 3); // filter the source buffer s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); @@ -77,12 +94,7 @@ static INLINE void vpx_filter_block1d16_h8_x_avx2( // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) - srcReg = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5))); - srcReg = _mm256_inserti128_si256( - srcReg, - _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)), - 1); + srcReg = mm256_loadu2_si128(src_ptr + 5, src_ptr + src_pixels_per_line + 5); // filter the source buffer s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); @@ -97,60 +109,37 @@ static INLINE void vpx_filter_block1d16_h8_x_avx2( src_ptr += src_stride; - // average if necessary - outReg1 = _mm256_castsi256_si128(outReg32b1); - outReg2 = _mm256_extractf128_si256(outReg32b1, 1); if (avg) { - outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); - outReg2 = _mm_avg_epu8( - outReg2, _mm_load_si128((__m128i *)(output_ptr + output_pitch))); + const __m256i outReg = mm256_loadu2_si128( + (__m128i *)output_ptr, (__m128i *)(output_ptr + output_pitch)); + outReg32b1 = _mm256_avg_epu8(outReg32b1, outReg); } - - // save 16 bytes - 
_mm_store_si128((__m128i *)output_ptr, outReg1); - - // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + output_pitch), outReg2); - + mm256_store2_si128((__m128i *)output_ptr, + (__m128i *)(output_ptr + output_pitch), &outReg32b1); output_ptr += dst_stride; } // if the number of strides is odd. // process only 16 bytes if (i > 0) { - __m128i srcReg; - - // load the first 16 bytes of the last row - srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + const __m128i srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + const __m128i srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); + const __m256i srcReg = + _mm256_inserti128_si256(_mm256_castsi128_si256(srcReg1), srcReg2, 1); // filter the source buffer - s[0] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0]))); - s[1] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1]))); - s[2] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2]))); - s[3] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3]))); - outReg1 = convolve8_8_avx2(s, f); - - // reading the next 16 bytes - // (part of it was being read by earlier read) - srcReg = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); + s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm256_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm256_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm256_shuffle_epi8(srcReg, filt[3]); - // filter the source buffer - s[0] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0]))); - s[1] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1]))); - s[2] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2]))); - s[3] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3]))); - outReg2 = convolve8_8_avx2(s, f); + // The low and high 128-bits of each lane contain the first and second + // convolve result respectively + outReg32b1 = convolve8_16_avx2(s, f); + outReg1 = _mm256_castsi256_si128(outReg32b1); + outReg2 = _mm256_extractf128_si256(outReg32b1, 1); - // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane - // contain the first and second convolve result respectively + // shrink to 8 bit each 16 bits outReg1 = _mm_packus_epi16(outReg1, outReg2); // average if necessary @@ -177,11 +166,63 @@ static void vpx_filter_block1d16_h8_avg_avx2( output_height, filter, 1); } +static void vpx_filter_block1d8_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m256i filt[4], f1[4], s1[4], srcReg; + __m128i f[4], s[4]; + int y = output_height; + + // Multiply the size of the source stride by two + const ptrdiff_t src_stride = src_pitch << 1; + + shuffle_filter_avx2(filter, f1); + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + // Process next 4 rows + while (y > 3) { + CALC_CONVOLVE8_HORZ_ROW + CALC_CONVOLVE8_HORZ_ROW + y -= 4; + } + + // If remaining, then process 2 rows at a time + while (y > 1) { + CALC_CONVOLVE8_HORZ_ROW + y -= 2; + } + + // For the remaining height. 
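// --------------------------------------------------------------------------
// Illustrative scalar sketch (not part of this commit; function name is
// hypothetical): what one output row of the 8-tap horizontal filter in this
// function computes.  The AVX2 path above produces the same values two rows
// per iteration; the rounding term 64 and the shift by 7 match the 7-bit
// filter precision used here (taps summing to 128).
static void filter8_h_row_ref_c(const uint8_t *src, uint8_t *dst, int width,
                                const int16_t *filter) {
  int x, k;
  for (x = 0; x < width; ++x) {
    int sum = 64;  // rounding before >> 7
    for (k = 0; k < 8; ++k) sum += src[x - 3 + k] * filter[k];
    sum >>= 7;
    dst[x] = (uint8_t)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
  }
}
// --------------------------------------------------------------------------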
+ if (y > 0) { + const __m128i src_reg_128 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + f[0] = _mm256_castsi256_si128(f1[0]); + f[1] = _mm256_castsi256_si128(f1[1]); + f[2] = _mm256_castsi256_si128(f1[2]); + f[3] = _mm256_castsi256_si128(f1[3]); + + // filter the source buffer + s[0] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[0])); + s[1] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[1])); + s[2] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[2])); + s[3] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[3])); + s[0] = convolve8_8_ssse3(s, f); + + // Saturate 16bit value to 8bit. + s[0] = _mm_packus_epi16(s[0], s[0]); + + // Save only 8 bytes + _mm_storel_epi64((__m128i *)&output_ptr[0], s[0]); + } +} + static INLINE void vpx_filter_block1d16_v8_x_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter, const int avg) { - __m128i outReg1, outReg2; __m256i srcRegHead1; unsigned int i; ptrdiff_t src_stride, dst_stride; @@ -260,19 +301,14 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2( src_ptr += src_stride; // average if necessary - outReg1 = _mm256_castsi256_si128(s1[0]); - outReg2 = _mm256_extractf128_si256(s1[0], 1); if (avg) { - outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); - outReg2 = _mm_avg_epu8( - outReg2, _mm_load_si128((__m128i *)(output_ptr + out_pitch))); + const __m256i outReg = mm256_loadu2_si128( + (__m128i *)output_ptr, (__m128i *)(output_ptr + out_pitch)); + s1[0] = _mm256_avg_epu8(s1[0], outReg); } - // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, outReg1); - - // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + out_pitch), outReg2); + mm256_store2_si128((__m128i *)output_ptr, + (__m128i *)(output_ptr + out_pitch), s1); output_ptr += dst_stride; @@ -534,9 +570,6 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, const ptrdiff_t unrolled_dst_stride = dst_stride << 1; int h; - __m256i src_reg, src_reg_shift_0, src_reg_shift_2; - __m256i dst_reg; - __m256i tmp_0, tmp_1; __m256i idx_shift_0 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); @@ -557,9 +590,11 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, for (h = height; h >= 2; h -= 2) { // Load the source - src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); - src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); - src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + const __m256i src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); + __m256i dst_reg; + __m256i tmp_0, tmp_1; + const __m256i src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + const __m256i src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); // Get the output tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23); @@ -580,9 +615,9 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, // Repeat for the last row if needed if (h > 0) { - __m128i src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + const __m128i src_reg = _mm_loadu_si128((const __m128i *)src_ptr); __m128i dst_reg; - const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + const __m128i reg_32_128 = _mm_set1_epi16(32); // Used for rounding __m128i tmp_0, tmp_1; __m128i src_reg_shift_0 = @@ -596,7 +631,7 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, _mm256_castsi256_si128(kernel_reg_45)); dst_reg 
= _mm_adds_epi16(tmp_0, tmp_1); - dst_reg = mm_round_epi16_sse2(&dst_reg, &reg_32, 6); + dst_reg = mm_round_epi16_sse2(&dst_reg, &reg_32_128, 6); dst_reg = _mm_packus_epi16(dst_reg, _mm_setzero_si128()); @@ -715,8 +750,6 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, const ptrdiff_t unrolled_src_stride = src_stride << 1; const ptrdiff_t unrolled_dst_stride = dst_stride << 1; - __m256i src_reg, src_reg_shuf; - __m256i dst; __m256i shuf_idx = _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); @@ -733,12 +766,12 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, for (h = height; h > 1; h -= 2) { // Load the source - src_reg = mm256_loadu2_epi64((const __m128i *)src_ptr, - (const __m128i *)(src_ptr + src_stride)); - src_reg_shuf = _mm256_shuffle_epi8(src_reg, shuf_idx); + const __m256i src_reg = mm256_loadu2_epi64( + (const __m128i *)src_ptr, (const __m128i *)(src_ptr + src_stride)); + const __m256i src_reg_shuf = _mm256_shuffle_epi8(src_reg, shuf_idx); // Get the result - dst = _mm256_maddubs_epi16(src_reg_shuf, kernel_reg); + __m256i dst = _mm256_maddubs_epi16(src_reg_shuf, kernel_reg); dst = _mm256_hadds_epi16(dst, _mm256_setzero_si256()); // Round result @@ -757,7 +790,7 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, if (h > 0) { // Load the source - const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + const __m128i reg_32_128 = _mm_set1_epi16(32); // Used for rounding __m128i src_reg = _mm_loadl_epi64((const __m128i *)src_ptr); __m128i src_reg_shuf = _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(shuf_idx)); @@ -768,7 +801,7 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, dst = _mm_hadds_epi16(dst, _mm_setzero_si128()); // Round result - dst = mm_round_epi16_sse2(&dst, &reg_32, 6); + dst = mm_round_epi16_sse2(&dst, &reg_32_128, 6); // Pack to 8-bits dst = _mm_packus_epi16(dst, _mm_setzero_si128()); @@ -866,22 +899,399 @@ static void vpx_filter_block1d4_v4_avx2(const uint8_t *src_ptr, } } +static void vpx_filter_block1d8_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m256i f[4], ss[4]; + __m256i r[8]; + __m128i s[9]; + + unsigned int y = output_height; + // Multiply the size of the source stride by two + const ptrdiff_t src_stride = src_pitch << 1; + + // The output_height is always a multiple of two.
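// --------------------------------------------------------------------------
// Reference sketch (illustrative only, hypothetical name): each output pixel
// of the 8-tap vertical filter below is a column convolution over eight
// source rows with the usual (sum + 64) >> 7 rounding and 8-bit saturation.
// The lane merging that follows simply evaluates this for two 8-pixel rows
// per loop iteration.
static uint8_t filter8_v_pixel_ref_c(const uint8_t *src, ptrdiff_t pitch,
                                     const int16_t *filter) {
  int k, sum = 64;
  for (k = 0; k < 8; ++k) sum += src[k * pitch] * filter[k];
  sum >>= 7;
  return (uint8_t)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
}
// --------------------------------------------------------------------------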
+ assert(!(output_height & 1)); + + shuffle_filter_avx2(filter, f); + s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch)); + s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch)); + + // merge the result together + // r[0]: 0 0 0 0 0 0 0 0 r17 r16 r15 r14 r13 r12 r11 r10 | 0 0 0 0 0 0 0 0 + // r07 r06 r05 r04 r03 r02 r01 r00 + r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[1], 1); + + // r[1]: 0 0 0 0 0 0 0 0 r27 r26 r25 r24 r23 r22 r21 r20 | 0 0 0 0 0 0 0 0 + // r17 r16 r15 r14 r13 r12 r11 r10 + r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[2], 1); + + // r[2]: 0 0 0 0 0 0 0 0 r37 r36 r35 r34 r33 r32 r31 r30 | 0 0 0 0 0 0 0 0 + // r27 r26 r25 r24 r23 r22 r21 r20 + r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[3], 1); + + // r[3]: 0 0 0 0 0 0 0 0 r47 r46 r45 r44 r43 r42 r41 r40 | 0 0 0 0 0 0 0 0 + // r37 r36 r35 r34 r33 r32 r31 r30 + r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[4], 1); + + // r[4]: 0 0 0 0 0 0 0 0 r57 r56 r55 r54 r53 r52 r51 r50 | 0 0 0 0 0 0 0 0 + // r47 r46 r45 r44 r43 r42 r41 r40 + r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[5], 1); + + // r[5]: 0 0 0 0 0 0 0 0 r67 r66 r65 r64 r63 r62 r61 r60 | 0 0 0 0 0 0 0 0 + // r57 r56 r55 r54 r53 r52 r51 r50 + r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[6], 1); + + // Merge together + // ss[0]: |r27 r17|.......|r21 r11|r20 r10 || r17 r07|.....|r12 r02|r11 + // r01|r10 r00| + ss[0] = _mm256_unpacklo_epi8(r[0], r[1]); + + // ss[0]: |r47 r37|.......|r41 r31|r40 r30 || r37 r27|.....|r32 r22|r31 + // r21|r30 r20| + ss[1] = _mm256_unpacklo_epi8(r[2], r[3]); + + // ss[2]: |r67 r57|.......|r61 r51|r60 r50 || r57 r47|.....|r52 r42|r51 + // r41|r50 r40| + ss[2] = _mm256_unpacklo_epi8(r[4], r[5]); + + // Process 2 rows at a time + do { + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); + s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch)); + + // r[6]: 0 0 0 0 0 0 0 0 r77 r76 r75 r74 r73 r72 r71 r70 | 0 0 0 0 0 0 0 + // 0 r67 r66 r65 r64 r63 r62 r61 r60 + r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[7], 1); + // r[7]: 0 0 0 0 0 0 0 0 r87 r86 r85 r84 r83 r82 r81 r80 | 0 0 0 0 0 0 0 + // 0 r77 r76 r75 r74 r73 r72 r71 r70 + r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[8], 1); + + // ss[3] : | r87 r77 | .......| r81 r71 | r80 r70 || r77 r67 | .....| r72 + // r62 | r71 r61|r70 r60| + ss[3] = _mm256_unpacklo_epi8(r[6], r[7]); + ss[0] = convolve8_16_avx2(ss, f); + ss[0] = _mm256_packus_epi16(ss[0], ss[0]); + src_ptr += src_stride; + + /* shift down two rows */ + s[6] = s[8]; + _mm_storel_epi64((__m128i *)&output_ptr[0], _mm256_castsi256_si128(ss[0])); + output_ptr += out_pitch; + _mm_storel_epi64((__m128i *)&output_ptr[0], + _mm256_extractf128_si256(ss[0], 1)); + output_ptr += out_pitch; + ss[0] = ss[1]; + ss[1] = ss[2]; + ss[2] = ss[3]; + y -= 2; + } while (y > 1); +} + +static void vpx_filter_block1d4_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg64_256bit; 
+ unsigned int y = output_height; + + assert(output_height > 1); + + addFilterReg64_256bit = _mm256_set1_epi16(32); + + // f7 f6 f5 f4 f3 f2 f1 f0 (16 bit) + filtersReg = _mm_loadu_si128((const __m128i *)filter); + + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + // f7 f6 f5 f4 f3 f2 f1 f0 || f7 f6 f5 f4 f3 f2 f1 f0 (8 bit each) + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + { + ptrdiff_t src_stride; + __m256i filt1Reg, filt2Reg, firstFilters, secondFilters; + // have the same data in both lanes of a 256 bit register + // f7 f6 f5 f4 f3 f2 f1 f0 f7 f6 f5 f4 f3 f2 f1 f0 | f7 f6 f5 f4 f3 f2 f1 f0 + // f7 f6 f5 f4 f3 f2 f1 f0 (8bit each) + const __m256i filtersReg32 = _mm256_broadcastsi128_si256(filtersReg); + + // duplicate only the first 32 bits + // f3 f2 f1 f0|f3 f2 f1 f0|f3 f2 f1 f0|f3 f2 f1 f0 | f3 f2 f1 f0|f3 f2 f1 + // f0|f3 f2 f1 f0|f3 f2 f1 f0 + firstFilters = _mm256_shuffle_epi32(filtersReg32, 0); + // duplicate only the second 32 bits + // f7 f6 f5 f4|f7 f6 f5 f4|f7 f6 f5 f4|f7 f6 f5 f4 | f7 f6 f5 f4|f7 f6 f5 + // f4|f7 f6 f5 f4|f7 f6 f5 f4 + secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55); + + // s6 s5 s4 s3 s5 s4 s3 s2 s4 s3 s2 s1 s3 s2 s1 s0 | s6 s5 s4 s3 s5 s4 s3 + // s2 s4 s3 s2 s1 s3 s2 s1 s0 + filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2); + + // s10 s9 s8 s7 s9 s8 s7 s6 s8 s7 s6 s5 s7 s6 s5 s4 | s10 s9 s8 s7 s9 s8 s7 + // s6 s8 s7 s6 s5 s7 s6 s5 s4 + filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + + do { + __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcReg32b1; + // load the 2 strides of source + // r115 r114 ...... r15 r14 r13 r12 r11 r10 | r015 r014 r013 ...... r07 + // r06 r05 r04 r03 r02 r01 r00 + srcReg32b1 = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch); + + // filter the source buffer + // r16 r15 r14 r13 r15 r14 r13 r12 r14 r13 r12 r11 r13 r12 r11 r10 | r06 + // r05 r04 r03 r05 r04 r03 r02 r04 r03 r02 r01 r03 r02 r01 r00 + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + + // multiply 4 adjacent elements with the filter and add the result + // ...|f3*r14+f2*r13|f1*r13+f0*r12|f3*r13+f2*r12|f1*r11+f0*r10||... 
+ // |f1*r03+f0*r02|f3*r04+f2*r03|f1*r02+f0*r01|f3*r03+f2*r02|f1*r01+f0*r00 + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + + // filter the source buffer + // r110 r19 r18 r17|r19 r18 r17 r16|r18 r17 r16 r15|r17 r16 r15 r14||r010 + // r09 r08 r07|r09 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04 + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + + // multiply 4 adjacent elements with the filter and add the result + // r010 r09 r08 r07|r9 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04||r010 + // r09 r08 r07|r9 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04 + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + + srcRegFilt32b1_1 = + _mm256_add_epi16(srcRegFilt32b1_1, addFilterReg64_256bit); + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + srcRegFilt32b1_1 = + _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + // 0 0 0 0 R13 R12 R11 R10 || 0 0 0 0 R03 R02 R01 R00 (16bit) + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); + + // 8zeros 0 0 0 0 R13 R12 R11 R10 || 8zeros 0 0 0 0 R03 R02 R01 R00 (8bit) + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + // save first row 4 values + *((int *)&output_ptr[0]) = + _mm_cvtsi128_si32(_mm256_castsi256_si128(srcRegFilt32b1_1)); + output_ptr += output_pitch; + + // save second row 4 values + *((int *)&output_ptr[0]) = + _mm_cvtsi128_si32(_mm256_extractf128_si256(srcRegFilt32b1_1, 1)); + output_ptr += output_pitch; + + y = y - 2; + } while (y > 1); + + // For remaining height + if (y > 0) { + __m128i srcReg1, srcRegFilt1_1, addFilterReg64; + __m128i srcRegFilt2; + + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + // filter the source buffer + srcRegFilt1_1 = + _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, + _mm256_castsi256_si128(firstFilters)); + + // filter the source buffer + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters)); + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128()); + // shift by 6 bit each 16 bit + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 4 bytes + *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1); + } + } +} + +static void vpx_filter_block1d4_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m256i f[4], ss[4]; + __m256i r[9], rr[2]; + __m128i s[11]; + + unsigned int y = output_height; + // Multiply the size of the source stride by four + const ptrdiff_t src_stride = src_pitch << 2; + const ptrdiff_t out_stride = out_pitch << 2; + + // The output_height is always a multiple of two. 
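// --------------------------------------------------------------------------
// Sketch of the byte-interleave + maddubs idea used below, shown at the
// SSSE3/128-bit level for clarity (the helper name and the assumption that
// the 16-bit taps have already been packed to 8 bits are illustrative).
// Interleaving two source rows lets one multiply-add apply a pair of
// adjacent filter taps to both rows at once:
static __m128i tap_pair_partial_sum(__m128i row_k, __m128i row_k1, int8_t f_k,
                                    int8_t f_k1) {
  // px: rk[0] rk1[0] rk[1] rk1[1] ... (bytes of the two rows, interleaved)
  const __m128i px = _mm_unpacklo_epi8(row_k, row_k1);
  // ff: the tap pair (f_k, f_k1) repeated across every byte pair
  const __m128i ff =
      _mm_set1_epi16((int16_t)(((uint8_t)f_k1 << 8) | (uint8_t)f_k));
  // 16-bit lane i of the result is rk[i] * f_k + rk1[i] * f_k1
  return _mm_maddubs_epi16(px, ff);
}
// --------------------------------------------------------------------------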
+ assert(!(output_height & 0x01)); + + shuffle_filter_avx2(filter, f); + + s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch)); + s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch)); + + r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[2], 1); + r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[3], 1); + r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[4], 1); + r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[5], 1); + r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[6], 1); + + // r37.....r24..r33..r31 r30 r23 r22 r21 r20|r17....r14 r07..r05 r04 r13 r12 + // r11 r10 r03 r02 r01 r00 + rr[0] = _mm256_unpacklo_epi32(r[0], r[1]); + + // r47.....r34..r43..r41 r40 r33 r32 r31 r30|r27....r24 r17..r15 r14 r23 r22 + // r21 r20 r13 r12 r11 r10 + rr[1] = _mm256_unpacklo_epi32(r[1], r[2]); + + // r43 r33....r40 r30|r33 r23....r30 r20||r23 r13....r20 r10|r13 r03....r10 + // r00| + ss[0] = _mm256_unpacklo_epi8(rr[0], rr[1]); + + // r37.....r24..r33..r31 r30 r23 r22 r21 r20||r17....r14 r07..r05 r04 r13 r12 + // r11 r10 r03 r02 r01 r00 + rr[0] = _mm256_unpacklo_epi32(r[2], r[3]); + + // r47.....r34..r43..r41 r40 r33 r32 r31 r30|r27....r24 r17..r15 r14 r23 r22 + // r21 r20 r13 r12 r11 r10 + rr[1] = _mm256_unpacklo_epi32(r[3], r[4]); + + // r63 r53....r60 r50|r53 r43....r50 r40||r43 r33....r40 r30|r33 r23....r30 + // r20| + ss[1] = _mm256_unpacklo_epi8(rr[0], rr[1]); + // Process 4 rows at a time + while (y >= 4) { + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); + s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch)); + s[9] = _mm_loadl_epi64((const __m128i *)(src_ptr + 9 * src_pitch)); + s[10] = _mm_loadl_epi64((const __m128i *)(src_ptr + 10 * src_pitch)); + + r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[7], 1); + r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[8], 1); + rr[0] = _mm256_unpacklo_epi32(r[4], r[5]); + rr[1] = _mm256_unpacklo_epi32(r[5], r[6]); + ss[2] = _mm256_unpacklo_epi8(rr[0], rr[1]); + + r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[9], 1); + r[8] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[8]), s[10], 1); + rr[0] = _mm256_unpacklo_epi32(r[6], r[7]); + rr[1] = _mm256_unpacklo_epi32(r[7], r[8]); + ss[3] = _mm256_unpacklo_epi8(rr[0], rr[1]); + + ss[0] = convolve8_16_avx2(ss, f); + + // r3 r2 r3 r2 r1 r0 r1 r0 + ss[0] = _mm256_packus_epi16(ss[0], ss[0]); + src_ptr += src_stride; + + mm256_storeu2_epi32((__m128i *const)output_ptr, + (__m128i *const)(output_ptr + (2 * out_pitch)), ss); + + ss[0] = _mm256_srli_si256(ss[0], 4); + + mm256_storeu2_epi32((__m128i *const)(output_ptr + (1 * out_pitch)), + (__m128i *const)(output_ptr + (3 * out_pitch)), ss); + + output_ptr += out_stride; + + ss[0] = ss[2]; + ss[1] = ss[3]; + + s[6] = s[10]; + s[5] = s[9]; + + r[4] = r[8]; + y -= 4; + } + + // Process 2 rows + if (y == 2) { + __m128i ss1[4], f1[4], r1[4]; + + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); + s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch)); + + 
f1[0] = _mm256_castsi256_si128(f[0]); + f1[1] = _mm256_castsi256_si128(f[1]); + f1[2] = _mm256_castsi256_si128(f[2]); + f1[3] = _mm256_castsi256_si128(f[3]); + + r1[0] = _mm_unpacklo_epi32(s[4], s[5]); + r1[1] = _mm_unpacklo_epi32(s[5], s[6]); + + // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60 + r1[2] = _mm_unpacklo_epi32(s[6], s[7]); + + // R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70 + r1[3] = _mm_unpacklo_epi32(s[7], s[8]); + + // r23 r13....r20 r10|r13 r03....r10 r00 + ss1[0] = _mm256_castsi256_si128(ss[0]); + + // r43 r33....r40 r30|r33 r23....r30 r20 + ss1[1] = _mm256_castsi256_si128(ss[1]); + + // r63 r53....r60 r50|r53 r43....r50 r40 + ss1[2] = _mm_unpacklo_epi8(r1[0], r1[1]); + + // r83 r73....r80 r70|r73 r63....r70 r60 + ss1[3] = _mm_unpacklo_epi8(r1[2], r1[3]); + + ss1[0] = convolve8_8_ssse3(ss1, f1); + + // r1 r0 r1 r0 + ss1[0] = _mm_packus_epi16(ss1[0], ss1[0]); + + // Save first row 4 values + *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(ss1[0]); + output_ptr += out_pitch; + + ss1[0] = _mm_srli_si128(ss1[0], 4); + // Save second row 4 values + *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(ss1[0]); + } +} + #if HAVE_AVX2 && HAVE_SSSE3 -filter8_1dfunction vpx_filter_block1d4_v8_ssse3; #if VPX_ARCH_X86_64 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; -#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3 -#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_intrin_ssse3 -#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3 -#else // VPX_ARCH_X86 +#else // VPX_ARCH_X86 filter8_1dfunction vpx_filter_block1d8_v8_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_ssse3; filter8_1dfunction vpx_filter_block1d4_h8_ssse3; -#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3 -#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3 -#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3 #endif // VPX_ARCH_X86_64 filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3; @@ -897,7 +1307,6 @@ filter8_1dfunction vpx_filter_block1d8_v2_ssse3; filter8_1dfunction vpx_filter_block1d8_h2_ssse3; filter8_1dfunction vpx_filter_block1d4_v2_ssse3; filter8_1dfunction vpx_filter_block1d4_h2_ssse3; -#define vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3 #define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3 #define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3 #define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3 |
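// --------------------------------------------------------------------------
// For reference, a scalar sketch (hypothetical name) of the quantity the new
// vpx_sse_avx2 / vpx_sse_sse4_1 kernels added earlier in this diff compute:
// the plain sum of squared differences over a width x height block.  The SIMD
// versions only change how the 8-bit differences are widened and accumulated.
static int64_t sse_block_ref_c(const uint8_t *a, int a_stride,
                               const uint8_t *b, int b_stride, int width,
                               int height) {
  int64_t sse = 0;
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      const int d = a[x] - b[x];
      sse += d * d;
    }
    a += a_stride;
    b += b_stride;
  }
  return sse;
}
// --------------------------------------------------------------------------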