diff options
Diffstat (limited to 'vpx_dsp/x86/highbd_sad_avx2.c')
-rw-r--r-- | vpx_dsp/x86/highbd_sad_avx2.c | 188 |
1 files changed, 121 insertions, 67 deletions
diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c index 231b67f80..78f8eb8bf 100644 --- a/vpx_dsp/x86/highbd_sad_avx2.c +++ b/vpx_dsp/x86/highbd_sad_avx2.c @@ -50,39 +50,49 @@ static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16, } } -#define HIGHBD_SAD64XN(n) \ - unsigned int vpx_highbd_sad64x##n##_avx2( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - __m256i sums_32 = _mm256_setzero_si256(); \ - int i; \ - \ - for (i = 0; i < (n / 2); ++i) { \ - __m256i sums_16 = _mm256_setzero_si256(); \ - \ - highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); \ - \ - /* sums_16 will outrange after 2 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32 = _mm256_add_epi32( \ - sums_32, \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ - _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ - \ - src += src_stride << 1; \ - ref += ref_stride << 1; \ - } \ - return calc_final(sums_32); \ +static VPX_FORCE_INLINE unsigned int highbd_sad64xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + int i; + + for (i = 0; i < (n / 2); ++i) { + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); + + /* sums_16 will outrange after 2 rows, so add current sums_16 to + * sums_32*/ + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 1; + ref += ref_stride << 1; } + return calc_final(sums_32); +} -// 64x64 -HIGHBD_SAD64XN(64) +#define 
HIGHBD_SAD64XN(n) \ + unsigned int vpx_highbd_sad64x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad64xN_avx2(src, src_stride, ref, ref_stride, n); \ + } -// 64x32 -HIGHBD_SAD64XN(32) +#define HIGHBD_SADSKIP64xN(n) \ + unsigned int vpx_highbd_sad_skip_64x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad64xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, const uint16_t *src, int src_stride, @@ -107,42 +117,49 @@ static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, } } -#define HIGHBD_SAD32XN(n) \ - unsigned int vpx_highbd_sad32x##n##_avx2( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - __m256i sums_32 = _mm256_setzero_si256(); \ - int i; \ - \ - for (i = 0; i < (n / 8); ++i) { \ - __m256i sums_16 = _mm256_setzero_si256(); \ - \ - highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); \ - \ - /* sums_16 will outrange after 8 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32 = _mm256_add_epi32( \ - sums_32, \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ - _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ - \ - src += src_stride << 3; \ - ref += ref_stride << 3; \ - } \ - return calc_final(sums_32); \ - } +static VPX_FORCE_INLINE unsigned int highbd_sad32xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + int i; -// 32x64 -HIGHBD_SAD32XN(64) + for (i = 0; i < (n / 8); ++i) { + __m256i sums_16 = _mm256_setzero_si256(); -// 32x32 
-HIGHBD_SAD32XN(32) + highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); -// 32x16 -HIGHBD_SAD32XN(16) + /* sums_16 will outrange after 8 rows, so add current sums_16 to + * sums_32*/ + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 3; + ref += ref_stride << 3; + } + return calc_final(sums_32); +} + +#define HIGHBD_SAD32XN(n) \ + unsigned int vpx_highbd_sad32x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad32xN_avx2(src, src_stride, ref, ref_stride, n); \ + } + +#define HIGHBD_SADSKIP32xN(n) \ + unsigned int vpx_highbd_sad_skip_32x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad32xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, const uint16_t *src, int src_stride, @@ -167,17 +184,22 @@ static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, } } -unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride) { +static VPX_FORCE_INLINE unsigned int highbd_sad16xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); __m256i sums_32 = _mm256_setzero_si256(); + const int height = VPXMIN(16, n); + const int num_iters = n / height; int i; - for (i = 0; i < 2; ++i) { + for (i = 0; i < num_iters; ++i) { __m256i sums_16 = _mm256_setzero_si256(); - highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16); + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, height); // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 sums_32 = _mm256_add_epi32( @@ 
-192,6 +214,21 @@ unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride, return calc_final(sums_32); } +#define HIGHBD_SAD16XN(n) \ + unsigned int vpx_highbd_sad16x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad16xN_avx2(src, src_stride, ref, ref_stride, n); \ + } + +#define HIGHBD_SADSKIP16xN(n) \ + unsigned int vpx_highbd_sad_skip_16x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad16xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } + unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); @@ -224,6 +261,23 @@ unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride, } } +// clang-format off +HIGHBD_SAD64XN(64) +HIGHBD_SADSKIP64xN(64) +HIGHBD_SAD64XN(32) +HIGHBD_SADSKIP64xN(32) +HIGHBD_SAD32XN(64) +HIGHBD_SADSKIP32xN(64) +HIGHBD_SAD32XN(32) +HIGHBD_SADSKIP32xN(32) +HIGHBD_SAD32XN(16) +HIGHBD_SADSKIP32xN(16) +HIGHBD_SAD16XN(32) +HIGHBD_SADSKIP16xN(32) +HIGHBD_SADSKIP16xN(16) +HIGHBD_SADSKIP16xN(8) +// clang-format on + // AVG ------------------------------------------------------------------------- static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16, const uint16_t *src, |