diff options
Diffstat (limited to 'libvpx/vpx_dsp/x86')
-rw-r--r-- | libvpx/vpx_dsp/x86/avg_intrin_sse2.c | 2 | ||||
-rw-r--r-- | libvpx/vpx_dsp/x86/avg_pred_sse2.c | 6 | ||||
-rw-r--r-- | libvpx/vpx_dsp/x86/highbd_convolve_avx2.c | 12 | ||||
-rw-r--r-- | libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 10 | ||||
-rw-r--r-- | libvpx/vpx_dsp/x86/highbd_variance_sse2.c | 86 | ||||
-rw-r--r-- | libvpx/vpx_dsp/x86/loopfilter_sse2.c | 16 | ||||
-rw-r--r-- | libvpx/vpx_dsp/x86/mem_sse2.h | 6 | ||||
-rw-r--r-- | libvpx/vpx_dsp/x86/quantize_avx.c | 11 | ||||
-rw-r--r-- | libvpx/vpx_dsp/x86/quantize_sse2.c | 6 | ||||
-rw-r--r-- | libvpx/vpx_dsp/x86/quantize_ssse3.c | 10 | ||||
-rw-r--r-- | libvpx/vpx_dsp/x86/sad4d_avx2.c | 63 | ||||
-rw-r--r-- | libvpx/vpx_dsp/x86/sad4d_avx512.c | 4 | ||||
-rw-r--r-- | libvpx/vpx_dsp/x86/sad_avx2.c | 36 | ||||
-rw-r--r-- | libvpx/vpx_dsp/x86/sad_sse3.asm | 376 | ||||
-rw-r--r-- | libvpx/vpx_dsp/x86/sad_sse4.asm | 361 | ||||
-rw-r--r-- | libvpx/vpx_dsp/x86/sad_ssse3.asm | 372 | ||||
-rw-r--r-- | libvpx/vpx_dsp/x86/variance_sse2.c | 64 | ||||
-rw-r--r-- | libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c | 24 | ||||
-rw-r--r-- | libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 12 | ||||
-rw-r--r-- | libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c | 12 |
20 files changed, 156 insertions, 1333 deletions
diff --git a/libvpx/vpx_dsp/x86/avg_intrin_sse2.c b/libvpx/vpx_dsp/x86/avg_intrin_sse2.c index 3cba258f6..9da2f34c9 100644 --- a/libvpx/vpx_dsp/x86/avg_intrin_sse2.c +++ b/libvpx/vpx_dsp/x86/avg_intrin_sse2.c @@ -464,7 +464,7 @@ int vpx_satd_sse2(const tran_low_t *coeff, int length) { return _mm_cvtsi128_si32(accum); } -void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, +void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height) { int idx; __m128i zero = _mm_setzero_si128(); diff --git a/libvpx/vpx_dsp/x86/avg_pred_sse2.c b/libvpx/vpx_dsp/x86/avg_pred_sse2.c index e4e1e0e7a..c6e70f744 100644 --- a/libvpx/vpx_dsp/x86/avg_pred_sse2.c +++ b/libvpx/vpx_dsp/x86/avg_pred_sse2.c @@ -46,9 +46,9 @@ void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, r = _mm_loadu_si128((const __m128i *)ref); ref += 16; } else if (width == 4) { - r = _mm_set_epi32(loadu_uint32(ref + 3 * ref_stride), - loadu_uint32(ref + 2 * ref_stride), - loadu_uint32(ref + ref_stride), loadu_uint32(ref)); + r = _mm_set_epi32(loadu_int32(ref + 3 * ref_stride), + loadu_int32(ref + 2 * ref_stride), + loadu_int32(ref + ref_stride), loadu_int32(ref)); ref += 4 * ref_stride; } else { diff --git a/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c b/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c index 320962561..01a52ec8b 100644 --- a/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c +++ b/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c @@ -1465,10 +1465,10 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2; #define vpx_highbd_filter_block1d4_h4_avg_avx2 \ vpx_highbd_filter_block1d4_h8_avg_avx2 -HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0); +HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0) HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, - src - src_stride * (num_taps / 2 - 1), , avx2, 0); -HIGH_FUN_CONV_2D(, avx2, 0); + src - src_stride * (num_taps / 2 - 1), , avx2, 0) +HIGH_FUN_CONV_2D(, avx2, 0) // From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm. highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; @@ -1487,9 +1487,9 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2; #define vpx_highbd_filter_block1d4_v2_avg_avx2 \ vpx_highbd_filter_block1d4_v2_avg_sse2 -HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1); +HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1) HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, - src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1); -HIGH_FUN_CONV_2D(avg_, avx2, 1); + src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1) +HIGH_FUN_CONV_2D(avg_, avx2, 1) #undef HIGHBD_FUNC diff --git a/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index 7149e4fb7..4535a0f7a 100644 --- a/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -18,7 +18,7 @@ #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, - int skip_block, const int16_t *zbin_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, @@ -39,8 +39,6 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); (void)scan; - (void)skip_block; - assert(!skip_block); memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); @@ -94,8 +92,8 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, } void vpx_highbd_quantize_b_32x32_sse2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { @@ -107,8 +105,6 @@ void vpx_highbd_quantize_b_32x32_sse2( const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); (void)scan; - (void)skip_block; - assert(!skip_block); zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); zbins[1] = _mm_set1_epi32(zbin1_tmp); diff --git a/libvpx/vpx_dsp/x86/highbd_variance_sse2.c b/libvpx/vpx_dsp/x86/highbd_variance_sse2.c index dd6cfbb2c..7c8d79b09 100644 --- a/libvpx/vpx_dsp/x86/highbd_variance_sse2.c +++ b/libvpx/vpx_dsp/x86/highbd_variance_sse2.c @@ -121,8 +121,8 @@ static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, *sse = ROUND_POWER_OF_TWO(*sse, 8); \ } -HIGH_GET_VAR(16); -HIGH_GET_VAR(8); +HIGH_GET_VAR(16) +HIGH_GET_VAR(8) #undef HIGH_GET_VAR @@ -167,16 +167,16 @@ HIGH_GET_VAR(8); return (var >= 0) ? (uint32_t)var : 0; \ } -VAR_FN(64, 64, 16, 12); -VAR_FN(64, 32, 16, 11); -VAR_FN(32, 64, 16, 11); -VAR_FN(32, 32, 16, 10); -VAR_FN(32, 16, 16, 9); -VAR_FN(16, 32, 16, 9); -VAR_FN(16, 16, 16, 8); -VAR_FN(16, 8, 8, 7); -VAR_FN(8, 16, 8, 7); -VAR_FN(8, 8, 8, 6); +VAR_FN(64, 64, 16, 12) +VAR_FN(64, 32, 16, 11) +VAR_FN(32, 64, 16, 11) +VAR_FN(32, 32, 16, 10) +VAR_FN(32, 16, 16, 9) +VAR_FN(16, 32, 16, 9) +VAR_FN(16, 16, 16, 8) +VAR_FN(16, 8, 8, 7) +VAR_FN(8, 16, 8, 7) +VAR_FN(8, 8, 8, 6) #undef VAR_FN @@ -255,10 +255,10 @@ unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, const uint16_t *ref, ptrdiff_t ref_stride, int height, \ unsigned int *sse, void *unused0, void *unused); #define DECLS(opt) \ - DECL(8, opt); \ + DECL(8, opt) \ DECL(16, opt) -DECLS(sse2); +DECLS(sse2) #undef DECLS #undef DECL @@ -383,20 +383,20 @@ DECLS(sse2); return (var >= 0) ? (uint32_t)var : 0; \ } -#define FNS(opt) \ - FN(64, 64, 16, 6, 6, opt, (int64_t)); \ - FN(64, 32, 16, 6, 5, opt, (int64_t)); \ - FN(32, 64, 16, 5, 6, opt, (int64_t)); \ - FN(32, 32, 16, 5, 5, opt, (int64_t)); \ - FN(32, 16, 16, 5, 4, opt, (int64_t)); \ - FN(16, 32, 16, 4, 5, opt, (int64_t)); \ - FN(16, 16, 16, 4, 4, opt, (int64_t)); \ - FN(16, 8, 16, 4, 3, opt, (int64_t)); \ - FN(8, 16, 8, 3, 4, opt, (int64_t)); \ - FN(8, 8, 8, 3, 3, opt, (int64_t)); \ - FN(8, 4, 8, 3, 2, opt, (int64_t)); - -FNS(sse2); +#define FNS(opt) \ + FN(64, 64, 16, 6, 6, opt, (int64_t)) \ + FN(64, 32, 16, 6, 5, opt, (int64_t)) \ + FN(32, 64, 16, 5, 6, opt, (int64_t)) \ + FN(32, 32, 16, 5, 5, opt, (int64_t)) \ + FN(32, 16, 16, 5, 4, opt, (int64_t)) \ + FN(16, 32, 16, 4, 5, opt, (int64_t)) \ + FN(16, 16, 16, 4, 4, opt, (int64_t)) \ + FN(16, 8, 16, 4, 3, opt, (int64_t)) \ + FN(8, 16, 8, 3, 4, opt, (int64_t)) \ + FN(8, 8, 8, 3, 3, opt, (int64_t)) \ + FN(8, 4, 8, 3, 2, opt, (int64_t)) + +FNS(sse2) #undef FNS #undef FN @@ -412,7 +412,7 @@ FNS(sse2); DECL(16, opt1) \ DECL(8, opt1) -DECLS(sse2); +DECLS(sse2) #undef DECL #undef DECLS @@ -542,20 +542,20 @@ DECLS(sse2); return (var >= 0) ? (uint32_t)var : 0; \ } -#define FNS(opt1) \ - FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ - FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ - FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ - FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ - FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ - FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ - FN(16, 16, 16, 4, 4, opt1, (int64_t)); \ - FN(16, 8, 16, 4, 3, opt1, (int64_t)); \ - FN(8, 16, 8, 4, 3, opt1, (int64_t)); \ - FN(8, 8, 8, 3, 3, opt1, (int64_t)); \ - FN(8, 4, 8, 3, 2, opt1, (int64_t)); - -FNS(sse2); +#define FNS(opt1) \ + FN(64, 64, 16, 6, 6, opt1, (int64_t)) \ + FN(64, 32, 16, 6, 5, opt1, (int64_t)) \ + FN(32, 64, 16, 5, 6, opt1, (int64_t)) \ + FN(32, 32, 16, 5, 5, opt1, (int64_t)) \ + FN(32, 16, 16, 5, 4, opt1, (int64_t)) \ + FN(16, 32, 16, 4, 5, opt1, (int64_t)) \ + FN(16, 16, 16, 4, 4, opt1, (int64_t)) \ + FN(16, 8, 16, 4, 3, opt1, (int64_t)) \ + FN(8, 16, 8, 4, 3, opt1, (int64_t)) \ + FN(8, 8, 8, 3, 3, opt1, (int64_t)) \ + FN(8, 4, 8, 3, 2, opt1, (int64_t)) + +FNS(sse2) #undef FNS #undef FN diff --git a/libvpx/vpx_dsp/x86/loopfilter_sse2.c b/libvpx/vpx_dsp/x86/loopfilter_sse2.c index b6ff24834..347c9fdbe 100644 --- a/libvpx/vpx_dsp/x86/loopfilter_sse2.c +++ b/libvpx/vpx_dsp/x86/loopfilter_sse2.c @@ -211,21 +211,21 @@ void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0); - storeu_uint32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); + storeu_int32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); ps1ps0 = _mm_srli_si128(ps1ps0, 4); - storeu_uint32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); + storeu_int32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); ps1ps0 = _mm_srli_si128(ps1ps0, 4); - storeu_uint32(s + 2 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); + storeu_int32(s + 2 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); ps1ps0 = _mm_srli_si128(ps1ps0, 4); - storeu_uint32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); + storeu_int32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); - storeu_uint32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); + storeu_int32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); qs1qs0 = _mm_srli_si128(qs1qs0, 4); - storeu_uint32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); + storeu_int32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); qs1qs0 = _mm_srli_si128(qs1qs0, 4); - storeu_uint32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); + storeu_int32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); qs1qs0 = _mm_srli_si128(qs1qs0, 4); - storeu_uint32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); + storeu_int32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); } void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch, diff --git a/libvpx/vpx_dsp/x86/mem_sse2.h b/libvpx/vpx_dsp/x86/mem_sse2.h index 258ab38e6..8b6d4d1dd 100644 --- a/libvpx/vpx_dsp/x86/mem_sse2.h +++ b/libvpx/vpx_dsp/x86/mem_sse2.h @@ -16,12 +16,12 @@ #include "./vpx_config.h" -static INLINE void storeu_uint32(void *dst, uint32_t v) { +static INLINE void storeu_int32(void *dst, int32_t v) { memcpy(dst, &v, sizeof(v)); } -static INLINE uint32_t loadu_uint32(const void *src) { - uint32_t v; +static INLINE int32_t loadu_int32(const void *src) { + int32_t v; memcpy(&v, src, sizeof(v)); return v; } diff --git a/libvpx/vpx_dsp/x86/quantize_avx.c b/libvpx/vpx_dsp/x86/quantize_avx.c index 0a91d36ea..706e4e641 100644 --- a/libvpx/vpx_dsp/x86/quantize_avx.c +++ b/libvpx/vpx_dsp/x86/quantize_avx.c @@ -21,8 +21,8 @@ #include "vpx_dsp/x86/quantize_ssse3.h" void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, @@ -39,8 +39,6 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i eob = zero, eob0; (void)scan; - (void)skip_block; - assert(!skip_block); *eob_ptr = 0; @@ -145,8 +143,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, @@ -166,8 +163,6 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, (void)scan; (void)n_coeffs; - (void)skip_block; - assert(!skip_block); // Setup global values. // The 32x32 halves zbin and round. diff --git a/libvpx/vpx_dsp/x86/quantize_sse2.c b/libvpx/vpx_dsp/x86/quantize_sse2.c index e38a4059a..459d95f28 100644 --- a/libvpx/vpx_dsp/x86/quantize_sse2.c +++ b/libvpx/vpx_dsp/x86/quantize_sse2.c @@ -18,8 +18,8 @@ #include "vpx_dsp/x86/quantize_sse2.h" void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, @@ -34,8 +34,6 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i eob, eob0; (void)scan; - (void)skip_block; - assert(!skip_block); // Setup global values. load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, diff --git a/libvpx/vpx_dsp/x86/quantize_ssse3.c b/libvpx/vpx_dsp/x86/quantize_ssse3.c index fc1d91959..9d2a88b7b 100644 --- a/libvpx/vpx_dsp/x86/quantize_ssse3.c +++ b/libvpx/vpx_dsp/x86/quantize_ssse3.c @@ -18,8 +18,8 @@ #include "vpx_dsp/x86/quantize_ssse3.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, @@ -34,8 +34,6 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i eob, eob0; (void)scan; - (void)skip_block; - assert(!skip_block); load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant, quant_shift_ptr, &shift); @@ -111,7 +109,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, @@ -131,8 +129,6 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, (void)scan; (void)n_coeffs; - (void)skip_block; - assert(!skip_block); // Setup global values. // The 32x32 halves zbin and round. diff --git a/libvpx/vpx_dsp/x86/sad4d_avx2.c b/libvpx/vpx_dsp/x86/sad4d_avx2.c index a5c4f8c53..399b67b3f 100644 --- a/libvpx/vpx_dsp/x86/sad4d_avx2.c +++ b/libvpx/vpx_dsp/x86/sad4d_avx2.c @@ -11,8 +11,12 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +// Note with sums[4] some versions of Visual Studio may fail due to parameter +// alignment, though the functions should be equivalent: +// error C2719: 'sums': formal parameter with requested alignment of 32 won't be +// aligned static INLINE void calc_final_4(const __m256i *const sums /*[4]*/, - uint32_t *sad_array) { + uint32_t sad_array[4]) { const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]); const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]); const __m256i t2 = _mm256_hadd_epi32(t0, t1); @@ -69,63 +73,6 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums, sad_array); } -void vpx_sad32x32x8_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - uint32_t *sad_array) { - int i; - __m256i sums[8]; - - sums[0] = _mm256_setzero_si256(); - sums[1] = _mm256_setzero_si256(); - sums[2] = _mm256_setzero_si256(); - sums[3] = _mm256_setzero_si256(); - sums[4] = _mm256_setzero_si256(); - sums[5] = _mm256_setzero_si256(); - sums[6] = _mm256_setzero_si256(); - sums[7] = _mm256_setzero_si256(); - - for (i = 0; i < 32; i++) { - __m256i r[8]; - - // load src and all ref[] - const __m256i s = _mm256_load_si256((const __m256i *)src_ptr); - r[0] = _mm256_loadu_si256((const __m256i *)&ref_ptr[0]); - r[1] = _mm256_loadu_si256((const __m256i *)&ref_ptr[1]); - r[2] = _mm256_loadu_si256((const __m256i *)&ref_ptr[2]); - r[3] = _mm256_loadu_si256((const __m256i *)&ref_ptr[3]); - r[4] = _mm256_loadu_si256((const __m256i *)&ref_ptr[4]); - r[5] = _mm256_loadu_si256((const __m256i *)&ref_ptr[5]); - r[6] = _mm256_loadu_si256((const __m256i *)&ref_ptr[6]); - r[7] = _mm256_loadu_si256((const __m256i *)&ref_ptr[7]); - - // sum of the absolute differences between every ref[] to src - r[0] = _mm256_sad_epu8(r[0], s); - r[1] = _mm256_sad_epu8(r[1], s); - r[2] = _mm256_sad_epu8(r[2], s); - r[3] = _mm256_sad_epu8(r[3], s); - r[4] = _mm256_sad_epu8(r[4], s); - r[5] = _mm256_sad_epu8(r[5], s); - r[6] = _mm256_sad_epu8(r[6], s); - r[7] = _mm256_sad_epu8(r[7], s); - - // sum every ref[] - sums[0] = _mm256_add_epi32(sums[0], r[0]); - sums[1] = _mm256_add_epi32(sums[1], r[1]); - sums[2] = _mm256_add_epi32(sums[2], r[2]); - sums[3] = _mm256_add_epi32(sums[3], r[3]); - sums[4] = _mm256_add_epi32(sums[4], r[4]); - sums[5] = _mm256_add_epi32(sums[5], r[5]); - sums[6] = _mm256_add_epi32(sums[6], r[6]); - sums[7] = _mm256_add_epi32(sums[7], r[7]); - - src_ptr += src_stride; - ref_ptr += ref_stride; - } - - calc_final_4(sums, sad_array); - calc_final_4(sums + 4, sad_array + 4); -} - void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]) { diff --git a/libvpx/vpx_dsp/x86/sad4d_avx512.c b/libvpx/vpx_dsp/x86/sad4d_avx512.c index 4c5d70464..cfd23fedd 100644 --- a/libvpx/vpx_dsp/x86/sad4d_avx512.c +++ b/libvpx/vpx_dsp/x86/sad4d_avx512.c @@ -13,7 +13,7 @@ void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t res[4]) { + uint32_t sad_array[4]) { __m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; __m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3; __m512i sum_mlow, sum_mhigh; @@ -78,6 +78,6 @@ void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum256), _mm256_extractf128_si256(sum256, 1)); - _mm_storeu_si128((__m128i *)(res), sum128); + _mm_storeu_si128((__m128i *)(sad_array), sum128); } } diff --git a/libvpx/vpx_dsp/x86/sad_avx2.c b/libvpx/vpx_dsp/x86/sad_avx2.c index d94413430..3b48acd51 100644 --- a/libvpx/vpx_dsp/x86/sad_avx2.c +++ b/libvpx/vpx_dsp/x86/sad_avx2.c @@ -71,17 +71,17 @@ return res; \ } -#define FSAD64 \ - FSAD64_H(64); \ - FSAD64_H(32); +#define FSAD64 \ + FSAD64_H(64) \ + FSAD64_H(32) -#define FSAD32 \ - FSAD32_H(64); \ - FSAD32_H(32); \ - FSAD32_H(16); +#define FSAD32 \ + FSAD32_H(64) \ + FSAD32_H(32) \ + FSAD32_H(16) -FSAD64; -FSAD32; +FSAD64 +FSAD32 #undef FSAD64 #undef FSAD32 @@ -160,17 +160,17 @@ FSAD32; return res; \ } -#define FSADAVG64 \ - FSADAVG64_H(64); \ - FSADAVG64_H(32); +#define FSADAVG64 \ + FSADAVG64_H(64) \ + FSADAVG64_H(32) -#define FSADAVG32 \ - FSADAVG32_H(64); \ - FSADAVG32_H(32); \ - FSADAVG32_H(16); +#define FSADAVG32 \ + FSADAVG32_H(64) \ + FSADAVG32_H(32) \ + FSADAVG32_H(16) -FSADAVG64; -FSADAVG32; +FSADAVG64 +FSADAVG32 #undef FSADAVG64 #undef FSADAVG32 diff --git a/libvpx/vpx_dsp/x86/sad_sse3.asm b/libvpx/vpx_dsp/x86/sad_sse3.asm deleted file mode 100644 index acbd2e4fa..000000000 --- a/libvpx/vpx_dsp/x86/sad_sse3.asm +++ /dev/null @@ -1,376 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "vpx_ports/x86_abi_support.asm" - -%macro STACK_FRAME_CREATE_X3 0 -%if ABI_IS_32BIT - %define src_ptr rsi - %define src_stride rax - %define ref_ptr rdi - %define ref_stride rdx - %define end_ptr rcx - %define ret_var rbx - %define result_ptr arg(4) - %define height dword ptr arg(4) - push rbp - mov rbp, rsp - push rsi - push rdi - push rbx - - mov rsi, arg(0) ; src_ptr - mov rdi, arg(2) ; ref_ptr - - movsxd rax, dword ptr arg(1) ; src_stride - movsxd rdx, dword ptr arg(3) ; ref_stride -%else - %if LIBVPX_YASM_WIN64 - SAVE_XMM 7, u - %define src_ptr rcx - %define src_stride rdx - %define ref_ptr r8 - %define ref_stride r9 - %define end_ptr r10 - %define ret_var r11 - %define result_ptr [rsp+xmm_stack_space+8+4*8] - %define height dword ptr [rsp+xmm_stack_space+8+4*8] - %else - %define src_ptr rdi - %define src_stride rsi - %define ref_ptr rdx - %define ref_stride rcx - %define end_ptr r9 - %define ret_var r10 - %define result_ptr r8 - %define height r8 - %endif -%endif - -%endmacro - -%macro STACK_FRAME_DESTROY_X3 0 - %define src_ptr - %define src_stride - %define ref_ptr - %define ref_stride - %define end_ptr - %define ret_var - %define result_ptr - %define height - -%if ABI_IS_32BIT - pop rbx - pop rdi - pop rsi - pop rbp -%else - %if LIBVPX_YASM_WIN64 - RESTORE_XMM - %endif -%endif - ret -%endmacro - -%macro PROCESS_16X2X3 5 -%if %1==0 - movdqa xmm0, XMMWORD PTR [%2] - lddqu xmm5, XMMWORD PTR [%3] - lddqu xmm6, XMMWORD PTR [%3+1] - lddqu xmm7, XMMWORD PTR [%3+2] - - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [%2] - lddqu xmm1, XMMWORD PTR [%3] - lddqu xmm2, XMMWORD PTR [%3+1] - lddqu xmm3, XMMWORD PTR [%3+2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endif - movdqa xmm0, XMMWORD PTR [%2+%4] - lddqu xmm1, XMMWORD PTR [%3+%5] - lddqu xmm2, XMMWORD PTR [%3+%5+1] - lddqu xmm3, XMMWORD PTR [%3+%5+2] - -%if %1==0 || %1==1 - lea %2, [%2+%4*2] - lea %3, [%3+%5*2] -%endif - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endmacro - -%macro PROCESS_8X2X3 5 -%if %1==0 - movq mm0, QWORD PTR [%2] - movq mm5, QWORD PTR [%3] - movq mm6, QWORD PTR [%3+1] - movq mm7, QWORD PTR [%3+2] - - psadbw mm5, mm0 - psadbw mm6, mm0 - psadbw mm7, mm0 -%else - movq mm0, QWORD PTR [%2] - movq mm1, QWORD PTR [%3] - movq mm2, QWORD PTR [%3+1] - movq mm3, QWORD PTR [%3+2] - - psadbw mm1, mm0 - psadbw mm2, mm0 - psadbw mm3, mm0 - - paddw mm5, mm1 - paddw mm6, mm2 - paddw mm7, mm3 -%endif - movq mm0, QWORD PTR [%2+%4] - movq mm1, QWORD PTR [%3+%5] - movq mm2, QWORD PTR [%3+%5+1] - movq mm3, QWORD PTR [%3+%5+2] - -%if %1==0 || %1==1 - lea %2, [%2+%4*2] - lea %3, [%3+%5*2] -%endif - - psadbw mm1, mm0 - psadbw mm2, mm0 - psadbw mm3, mm0 - - paddw mm5, mm1 - paddw mm6, mm2 - paddw mm7, mm3 -%endmacro - -SECTION .text - -;void int vpx_sad16x16x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -globalsym(vpx_sad16x16x3_sse3) -sym(vpx_sad16x16x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rcx], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rcx+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rcx+8], xmm0 - - STACK_FRAME_DESTROY_X3 - -;void int vpx_sad16x8x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -globalsym(vpx_sad16x8x3_sse3) -sym(vpx_sad16x8x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rcx], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rcx+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rcx+8], xmm0 - - STACK_FRAME_DESTROY_X3 - -;void int vpx_sad8x16x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -globalsym(vpx_sad8x16x3_sse3) -sym(vpx_sad8x16x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - punpckldq mm5, mm6 - - movq [rcx], mm5 - movd [rcx+8], mm7 - - STACK_FRAME_DESTROY_X3 - -;void int vpx_sad8x8x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -globalsym(vpx_sad8x8x3_sse3) -sym(vpx_sad8x8x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - punpckldq mm5, mm6 - - movq [rcx], mm5 - movd [rcx+8], mm7 - - STACK_FRAME_DESTROY_X3 - -;void int vpx_sad4x4x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -globalsym(vpx_sad4x4x3_sse3) -sym(vpx_sad4x4x3_sse3): - - STACK_FRAME_CREATE_X3 - - movd mm0, DWORD PTR [src_ptr] - movd mm1, DWORD PTR [ref_ptr] - - movd mm2, DWORD PTR [src_ptr+src_stride] - movd mm3, DWORD PTR [ref_ptr+ref_stride] - - punpcklbw mm0, mm2 - punpcklbw mm1, mm3 - - movd mm4, DWORD PTR [ref_ptr+1] - movd mm5, DWORD PTR [ref_ptr+2] - - movd mm2, DWORD PTR [ref_ptr+ref_stride+1] - movd mm3, DWORD PTR [ref_ptr+ref_stride+2] - - psadbw mm1, mm0 - - punpcklbw mm4, mm2 - punpcklbw mm5, mm3 - - psadbw mm4, mm0 - psadbw mm5, mm0 - - lea src_ptr, [src_ptr+src_stride*2] - lea ref_ptr, [ref_ptr+ref_stride*2] - - movd mm0, DWORD PTR [src_ptr] - movd mm2, DWORD PTR [ref_ptr] - - movd mm3, DWORD PTR [src_ptr+src_stride] - movd mm6, DWORD PTR [ref_ptr+ref_stride] - - punpcklbw mm0, mm3 - punpcklbw mm2, mm6 - - movd mm3, DWORD PTR [ref_ptr+1] - movd mm7, DWORD PTR [ref_ptr+2] - - psadbw mm2, mm0 - - paddw mm1, mm2 - - movd mm2, DWORD PTR [ref_ptr+ref_stride+1] - movd mm6, DWORD PTR [ref_ptr+ref_stride+2] - - punpcklbw mm3, mm2 - punpcklbw mm7, mm6 - - psadbw mm3, mm0 - psadbw mm7, mm0 - - paddw mm3, mm4 - paddw mm7, mm5 - - mov rcx, result_ptr - - punpckldq mm1, mm3 - - movq [rcx], mm1 - movd [rcx+8], mm7 - - STACK_FRAME_DESTROY_X3 diff --git a/libvpx/vpx_dsp/x86/sad_sse4.asm b/libvpx/vpx_dsp/x86/sad_sse4.asm deleted file mode 100644 index 0818ed5f0..000000000 --- a/libvpx/vpx_dsp/x86/sad_sse4.asm +++ /dev/null @@ -1,361 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%macro PROCESS_16X2X8 1 -%if %1 - movdqa xmm0, XMMWORD PTR [rsi] - movq xmm1, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - movq xmm2, MMWORD PTR [rdi+16] - punpcklqdq xmm1, xmm3 - punpcklqdq xmm3, xmm2 - - movdqa xmm2, xmm1 - mpsadbw xmm1, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - - psrldq xmm0, 8 - - movdqa xmm4, xmm3 - mpsadbw xmm3, xmm0, 0x0 - mpsadbw xmm4, xmm0, 0x5 - - paddw xmm1, xmm2 - paddw xmm1, xmm3 - paddw xmm1, xmm4 -%else - movdqa xmm0, XMMWORD PTR [rsi] - movq xmm5, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - movq xmm2, MMWORD PTR [rdi+16] - punpcklqdq xmm5, xmm3 - punpcklqdq xmm3, xmm2 - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - - psrldq xmm0, 8 - - movdqa xmm4, xmm3 - mpsadbw xmm3, xmm0, 0x0 - mpsadbw xmm4, xmm0, 0x5 - - paddw xmm5, xmm2 - paddw xmm5, xmm3 - paddw xmm5, xmm4 - - paddw xmm1, xmm5 -%endif - movdqa xmm0, XMMWORD PTR [rsi + rax] - movq xmm5, MMWORD PTR [rdi+ rdx] - movq xmm3, MMWORD PTR [rdi+ rdx+8] - movq xmm2, MMWORD PTR [rdi+ rdx+16] - punpcklqdq xmm5, xmm3 - punpcklqdq xmm3, xmm2 - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - - psrldq xmm0, 8 - movdqa xmm4, xmm3 - mpsadbw xmm3, xmm0, 0x0 - mpsadbw xmm4, xmm0, 0x5 - - paddw xmm5, xmm2 - paddw xmm5, xmm3 - paddw xmm5, xmm4 - - paddw xmm1, xmm5 -%endmacro - -%macro PROCESS_8X2X8 1 -%if %1 - movq xmm0, MMWORD PTR [rsi] - movq xmm1, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm1, xmm3 - - movdqa xmm2, xmm1 - mpsadbw xmm1, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - paddw xmm1, xmm2 -%else - movq xmm0, MMWORD PTR [rsi] - movq xmm5, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm5, xmm3 - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - paddw xmm5, xmm2 - - paddw xmm1, xmm5 -%endif - movq xmm0, MMWORD PTR [rsi + rax] - movq xmm5, MMWORD PTR [rdi+ rdx] - movq xmm3, MMWORD PTR [rdi+ rdx+8] - punpcklqdq xmm5, xmm3 - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - paddw xmm5, xmm2 - - paddw xmm1, xmm5 -%endmacro - -%macro PROCESS_4X2X8 1 -%if %1 - movd xmm0, [rsi] - movq xmm1, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm1, xmm3 - - mpsadbw xmm1, xmm0, 0x0 -%else - movd xmm0, [rsi] - movq xmm5, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm5, xmm3 - - mpsadbw xmm5, xmm0, 0x0 - - paddw xmm1, xmm5 -%endif - movd xmm0, [rsi + rax] - movq xmm5, MMWORD PTR [rdi+ rdx] - movq xmm3, MMWORD PTR [rdi+ rdx+8] - punpcklqdq xmm5, xmm3 - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - mpsadbw xmm5, xmm0, 0x0 - - paddw xmm1, xmm5 -%endmacro - -%macro WRITE_AS_INTS 0 - mov rdi, arg(4) ;Results - pxor xmm0, xmm0 - movdqa xmm2, xmm1 - punpcklwd xmm1, xmm0 - punpckhwd xmm2, xmm0 - - movdqa [rdi], xmm1 - movdqa [rdi + 16], xmm2 -%endmacro - -SECTION .text - -;void vpx_sad16x16x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array); -globalsym(vpx_sad16x16x8_sse4_1) -sym(vpx_sad16x16x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_16X2X8 1 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void vpx_sad16x8x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -globalsym(vpx_sad16x8x8_sse4_1) -sym(vpx_sad16x8x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_16X2X8 1 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void vpx_sad8x8x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -globalsym(vpx_sad8x8x8_sse4_1) -sym(vpx_sad8x8x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_8X2X8 1 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void vpx_sad8x16x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -globalsym(vpx_sad8x16x8_sse4_1) -sym(vpx_sad8x16x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_8X2X8 1 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void vpx_sad4x4x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -globalsym(vpx_sad4x4x8_sse4_1) -sym(vpx_sad4x4x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_4X2X8 1 - PROCESS_4X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - - - diff --git a/libvpx/vpx_dsp/x86/sad_ssse3.asm b/libvpx/vpx_dsp/x86/sad_ssse3.asm deleted file mode 100644 index a5bc6d730..000000000 --- a/libvpx/vpx_dsp/x86/sad_ssse3.asm +++ /dev/null @@ -1,372 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%macro PROCESS_16X2X3 1 -%if %1 - movdqa xmm0, XMMWORD PTR [rsi] - lddqu xmm5, XMMWORD PTR [rdi] - lddqu xmm6, XMMWORD PTR [rdi+1] - lddqu xmm7, XMMWORD PTR [rdi+2] - - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [rsi] - lddqu xmm1, XMMWORD PTR [rdi] - lddqu xmm2, XMMWORD PTR [rdi+1] - lddqu xmm3, XMMWORD PTR [rdi+2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endif - movdqa xmm0, XMMWORD PTR [rsi+rax] - lddqu xmm1, XMMWORD PTR [rdi+rdx] - lddqu xmm2, XMMWORD PTR [rdi+rdx+1] - lddqu xmm3, XMMWORD PTR [rdi+rdx+2] - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endmacro - -%macro PROCESS_16X2X3_OFFSET 2 -%if %1 - movdqa xmm0, XMMWORD PTR [rsi] - movdqa xmm4, XMMWORD PTR [rdi] - movdqa xmm7, XMMWORD PTR [rdi+16] - - movdqa xmm5, xmm7 - palignr xmm5, xmm4, %2 - - movdqa xmm6, xmm7 - palignr xmm6, xmm4, (%2+1) - - palignr xmm7, xmm4, (%2+2) - - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [rsi] - movdqa xmm4, XMMWORD PTR [rdi] - movdqa xmm3, XMMWORD PTR [rdi+16] - - movdqa xmm1, xmm3 - palignr xmm1, xmm4, %2 - - movdqa xmm2, xmm3 - palignr xmm2, xmm4, (%2+1) - - palignr xmm3, xmm4, (%2+2) - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endif - movdqa xmm0, XMMWORD PTR [rsi+rax] - movdqa xmm4, XMMWORD PTR [rdi+rdx] - movdqa xmm3, XMMWORD PTR [rdi+rdx+16] - - movdqa xmm1, xmm3 - palignr xmm1, xmm4, %2 - - movdqa xmm2, xmm3 - palignr xmm2, xmm4, (%2+1) - - palignr xmm3, xmm4, (%2+2) - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endmacro - -%macro PROCESS_16X16X3_OFFSET 2 -%2_aligned_by_%1: - - sub rdi, %1 - - PROCESS_16X2X3_OFFSET 1, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - - jmp %2_store_off - -%endmacro - -%macro PROCESS_16X8X3_OFFSET 2 -%2_aligned_by_%1: - - sub rdi, %1 - - PROCESS_16X2X3_OFFSET 1, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - - jmp %2_store_off - -%endmacro - -SECTION .text - -;void int vpx_sad16x16x3_ssse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -globalsym(vpx_sad16x16x3_ssse3) -sym(vpx_sad16x16x3_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - push rsi - push rdi - push rcx - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - mov rdx, 0xf - and rdx, rdi - - jmp .vpx_sad16x16x3_ssse3_skiptable -.vpx_sad16x16x3_ssse3_jumptable: - dd .vpx_sad16x16x3_ssse3_aligned_by_0 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_1 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_2 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_3 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_4 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_5 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_6 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_7 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_8 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_9 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_12 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump -.vpx_sad16x16x3_ssse3_skiptable: - - call .vpx_sad16x16x3_ssse3_do_jump -.vpx_sad16x16x3_ssse3_do_jump: - pop rcx ; get the address of do_jump - mov rax, .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_ssse3_do_jump - add rax, rcx ; get the absolute address of vpx_sad16x16x3_ssse3_jumptable - - movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable - add rcx, rax - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - jmp rcx - - PROCESS_16X16X3_OFFSET 0, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 1, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 2, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 3, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 4, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 5, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 6, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 7, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 8, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 9, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3 - -.vpx_sad16x16x3_ssse3_aligned_by_15: - PROCESS_16X2X3 1 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - -.vpx_sad16x16x3_ssse3_store_off: - mov rdi, arg(4) ;Results - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rdi], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rdi+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rdi+8], xmm0 - - ; begin epilog - pop rcx - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void int vpx_sad16x8x3_ssse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -globalsym(vpx_sad16x8x3_ssse3) -sym(vpx_sad16x8x3_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - push rsi - push rdi - push rcx - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - mov rdx, 0xf - and rdx, rdi - - jmp .vpx_sad16x8x3_ssse3_skiptable -.vpx_sad16x8x3_ssse3_jumptable: - dd .vpx_sad16x8x3_ssse3_aligned_by_0 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_1 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_2 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_3 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_4 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_5 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_6 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_7 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_8 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_9 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump -.vpx_sad16x8x3_ssse3_skiptable: - - call .vpx_sad16x8x3_ssse3_do_jump -.vpx_sad16x8x3_ssse3_do_jump: - pop rcx ; get the address of do_jump - mov rax, .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ssse3_do_jump - add rax, rcx ; get the absolute address of vpx_sad16x8x3_ssse3_jumptable - - movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable - add rcx, rax - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - jmp rcx - - PROCESS_16X8X3_OFFSET 0, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 1, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 2, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 3, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 4, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 5, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 6, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 7, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 8, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 9, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3 - -.vpx_sad16x8x3_ssse3_aligned_by_15: - - PROCESS_16X2X3 1 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - -.vpx_sad16x8x3_ssse3_store_off: - mov rdi, arg(4) ;Results - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rdi], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rdi+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rdi+8], xmm0 - - ; begin epilog - pop rcx - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/libvpx/vpx_dsp/x86/variance_sse2.c b/libvpx/vpx_dsp/x86/variance_sse2.c index 37ef64eca..a67c92aad 100644 --- a/libvpx/vpx_dsp/x86/variance_sse2.c +++ b/libvpx/vpx_dsp/x86/variance_sse2.c @@ -36,8 +36,8 @@ unsigned int vpx_get_mb_ss_sse2(const int16_t *src_ptr) { } static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) { - const __m128i p0 = _mm_cvtsi32_si128(loadu_uint32(p + 0 * stride)); - const __m128i p1 = _mm_cvtsi32_si128(loadu_uint32(p + 1 * stride)); + const __m128i p0 = _mm_cvtsi32_si128(loadu_int32(p + 0 * stride)); + const __m128i p1 = _mm_cvtsi32_si128(loadu_int32(p + 1 * stride)); const __m128i p01 = _mm_unpacklo_epi32(p0, p1); return _mm_unpacklo_epi8(p01, _mm_setzero_si128()); } @@ -471,23 +471,23 @@ DECLS(ssse3, ssse3); (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ } -#define FNS(opt1, opt2) \ - FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \ - FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \ - FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \ - FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \ - FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \ - FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \ - FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \ - FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t)); \ - FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)); \ - FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)); \ - FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)); \ - FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)); \ +#define FNS(opt1, opt2) \ + FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)) \ + FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)) \ + FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)) \ + FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)) \ + FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)) \ + FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)) \ + FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \ + FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t)) \ + FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)) \ + FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)) \ + FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)) \ + FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)) \ FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t)) -FNS(sse2, sse2); -FNS(ssse3, ssse3); +FNS(sse2, sse2) +FNS(ssse3, ssse3) #undef FNS #undef FN @@ -543,23 +543,23 @@ DECLS(ssse3, ssse3); (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ } -#define FNS(opt1, opt2) \ - FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \ - FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \ - FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \ - FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \ - FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \ - FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \ - FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \ - FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t)); \ - FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)); \ - FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)); \ - FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)); \ - FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)); \ +#define FNS(opt1, opt2) \ + FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)) \ + FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)) \ + FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)) \ + FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)) \ + FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)) \ + FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)) \ + FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \ + FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t)) \ + FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)) \ + FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)) \ + FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)) \ + FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)) \ FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t)) -FNS(sse2, sse); -FNS(ssse3, ssse3); +FNS(sse2, sse) +FNS(ssse3, ssse3) #undef FNS #undef FN diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c index 239179028..0cbd151dc 100644 --- a/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c +++ b/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c @@ -1040,12 +1040,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2; // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0) FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - (num_taps / 2 - 1) * src_stride, , - sse2, 0); -FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1); + sse2, 0) +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1) FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, - src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1); + src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1) // void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, @@ -1057,8 +1057,8 @@ FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_2D(, sse2, 0); -FUN_CONV_2D(avg_, sse2, 1); +FUN_CONV_2D(, sse2, 0) +FUN_CONV_2D(avg_, sse2, 1) #if CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64 // From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm. @@ -1139,12 +1139,12 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; // const int16_t *filter_y, // int y_step_q4, // int w, int h, int bd); -HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0); +HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0) HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, - src - src_stride * (num_taps / 2 - 1), , sse2, 0); -HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1); + src - src_stride * (num_taps / 2 - 1), , sse2, 0) +HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1) HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, - src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1); + src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1) // void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, @@ -1156,6 +1156,6 @@ HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, // int y_step_q4, int w, int h, int bd); -HIGH_FUN_CONV_2D(, sse2, 0); -HIGH_FUN_CONV_2D(avg_, sse2, 1); +HIGH_FUN_CONV_2D(, sse2, 0) +HIGH_FUN_CONV_2D(avg_, sse2, 1) #endif // CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64 diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 1eaa19bfc..6f2983a4b 100644 --- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -969,12 +969,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, // int y_step_q4, int w, int h); -FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0) FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), , - avx2, 0); -FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1); + avx2, 0) +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1) FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, - src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1); + src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1) // void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, @@ -986,6 +986,6 @@ FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_2D(, avx2, 0); -FUN_CONV_2D(avg_, avx2, 1); +FUN_CONV_2D(, avx2, 0) +FUN_CONV_2D(avg_, avx2, 1) #endif // HAVE_AX2 && HAVE_SSSE3 diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c index 77355a208..ed46d6245 100644 --- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c +++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c @@ -731,12 +731,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, // int y_step_q4, int w, int h); -FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3, 0); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3, 0) FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), , - ssse3, 0); -FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3, 1); + ssse3, 0) +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3, 1) FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, - src - src_stride * (num_taps / 2 - 1), avg_, ssse3, 1); + src - src_stride * (num_taps / 2 - 1), avg_, ssse3, 1) static void filter_horiz_w8_ssse3(const uint8_t *const src, const ptrdiff_t src_stride, @@ -1083,5 +1083,5 @@ void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_2D(, ssse3, 0); -FUN_CONV_2D(avg_, ssse3, 1); +FUN_CONV_2D(, ssse3, 0) +FUN_CONV_2D(avg_, ssse3, 1) |