Diffstat (limited to 'libvpx/vpx_dsp/x86')
-rw-r--r--  libvpx/vpx_dsp/x86/avg_intrin_sse2.c                 2
-rw-r--r--  libvpx/vpx_dsp/x86/avg_pred_sse2.c                   6
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_convolve_avx2.c           12
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c    10
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_variance_sse2.c           86
-rw-r--r--  libvpx/vpx_dsp/x86/loopfilter_sse2.c                16
-rw-r--r--  libvpx/vpx_dsp/x86/mem_sse2.h                        6
-rw-r--r--  libvpx/vpx_dsp/x86/quantize_avx.c                   11
-rw-r--r--  libvpx/vpx_dsp/x86/quantize_sse2.c                   6
-rw-r--r--  libvpx/vpx_dsp/x86/quantize_ssse3.c                 10
-rw-r--r--  libvpx/vpx_dsp/x86/sad4d_avx2.c                     63
-rw-r--r--  libvpx/vpx_dsp/x86/sad4d_avx512.c                    4
-rw-r--r--  libvpx/vpx_dsp/x86/sad_avx2.c                       36
-rw-r--r--  libvpx/vpx_dsp/x86/sad_sse3.asm                    376
-rw-r--r--  libvpx/vpx_dsp/x86/sad_sse4.asm                    361
-rw-r--r--  libvpx/vpx_dsp/x86/sad_ssse3.asm                   372
-rw-r--r--  libvpx/vpx_dsp/x86/variance_sse2.c                  64
-rw-r--r--  libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c    24
-rw-r--r--  libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c    12
-rw-r--r--  libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c   12
20 files changed, 156 insertions, 1333 deletions
diff --git a/libvpx/vpx_dsp/x86/avg_intrin_sse2.c b/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
index 3cba258f6..9da2f34c9 100644
--- a/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
+++ b/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
@@ -464,7 +464,7 @@ int vpx_satd_sse2(const tran_low_t *coeff, int length) {
return _mm_cvtsi128_si32(accum);
}
-void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref,
+void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref,
const int ref_stride, const int height) {
int idx;
__m128i zero = _mm_setzero_si128();
diff --git a/libvpx/vpx_dsp/x86/avg_pred_sse2.c b/libvpx/vpx_dsp/x86/avg_pred_sse2.c
index e4e1e0e7a..c6e70f744 100644
--- a/libvpx/vpx_dsp/x86/avg_pred_sse2.c
+++ b/libvpx/vpx_dsp/x86/avg_pred_sse2.c
@@ -46,9 +46,9 @@ void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width,
r = _mm_loadu_si128((const __m128i *)ref);
ref += 16;
} else if (width == 4) {
- r = _mm_set_epi32(loadu_uint32(ref + 3 * ref_stride),
- loadu_uint32(ref + 2 * ref_stride),
- loadu_uint32(ref + ref_stride), loadu_uint32(ref));
+ r = _mm_set_epi32(loadu_int32(ref + 3 * ref_stride),
+ loadu_int32(ref + 2 * ref_stride),
+ loadu_int32(ref + ref_stride), loadu_int32(ref));
ref += 4 * ref_stride;
} else {
diff --git a/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c b/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c
index 320962561..01a52ec8b 100644
--- a/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c
+++ b/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c
@@ -1465,10 +1465,10 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
#define vpx_highbd_filter_block1d4_h4_avg_avx2 \
vpx_highbd_filter_block1d4_h8_avg_avx2
-HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0);
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0)
HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
- src - src_stride * (num_taps / 2 - 1), , avx2, 0);
-HIGH_FUN_CONV_2D(, avx2, 0);
+ src - src_stride * (num_taps / 2 - 1), , avx2, 0)
+HIGH_FUN_CONV_2D(, avx2, 0)
// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
@@ -1487,9 +1487,9 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
#define vpx_highbd_filter_block1d4_v2_avg_avx2 \
vpx_highbd_filter_block1d4_v2_avg_sse2
-HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1);
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1)
HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
- src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1);
-HIGH_FUN_CONV_2D(avg_, avx2, 1);
+ src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1)
+HIGH_FUN_CONV_2D(avg_, avx2, 1)
#undef HIGHBD_FUNC
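The trailing-semicolon removals in this file (and in the variance, SAD, and subpixel files below) all follow one pattern: the HIGH_FUN_CONV_*/VAR_FN-style macros expand to complete function definitions that already end in '}', so a ';' at the invocation site leaves a stray empty declaration at file scope, presumably flagged by -pedantic and clang's -Wextra-semi. A minimal standalone sketch; DEFINE_SUM is a hypothetical macro, not one from libvpx:

/* Hypothetical macro: expands to a full function definition, so the
 * invocation must not add a ';' of its own. */
#define DEFINE_SUM(name, type) \
  type name(type a, type b) { return a + b; }

DEFINE_SUM(sum_int, int) /* OK: the expansion already ends in '}' */
/* DEFINE_SUM(sum_long, long); would leave a stray ';' at file scope,
 * which -pedantic / -Wextra-semi builds warn about. */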
diff --git a/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
index 7149e4fb7..4535a0f7a 100644
--- a/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -18,7 +18,7 @@
#if CONFIG_VP9_HIGHBITDEPTH
void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
- int skip_block, const int16_t *zbin_ptr,
+ const int16_t *zbin_ptr,
const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
@@ -39,8 +39,6 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
(void)scan;
- (void)skip_block;
- assert(!skip_block);
memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
@@ -94,8 +92,8 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
}
void vpx_highbd_quantize_b_32x32_sse2(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
- const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
@@ -107,8 +105,6 @@ void vpx_highbd_quantize_b_32x32_sse2(
const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
(void)scan;
- (void)skip_block;
- assert(!skip_block);
zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
zbins[1] = _mm_set1_epi32(zbin1_tmp);
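The skip_block removals here and in the quantize_* files below share one shape: the parameter was dead inside the function, immediately cast to void and asserted false, so it is dropped from the signature and callers are expected not to invoke the quantizer at all for skipped blocks. A simplified, hypothetical before/after sketch (plain int arguments stand in for the real tran_low_t/int16_t parameter lists):

#include <assert.h>

/* before: dead flag, guarded by an assert */
static void quantize_old(const int *coeff, int n, int skip_block) {
  (void)skip_block;
  assert(!skip_block);
  (void)coeff;
  (void)n;
}

/* after: the flag is gone; callers simply skip the call instead */
static void quantize_new(const int *coeff, int n) {
  (void)coeff;
  (void)n;
}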
diff --git a/libvpx/vpx_dsp/x86/highbd_variance_sse2.c b/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
index dd6cfbb2c..7c8d79b09 100644
--- a/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
@@ -121,8 +121,8 @@ static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
*sse = ROUND_POWER_OF_TWO(*sse, 8); \
}
-HIGH_GET_VAR(16);
-HIGH_GET_VAR(8);
+HIGH_GET_VAR(16)
+HIGH_GET_VAR(8)
#undef HIGH_GET_VAR
@@ -167,16 +167,16 @@ HIGH_GET_VAR(8);
return (var >= 0) ? (uint32_t)var : 0; \
}
-VAR_FN(64, 64, 16, 12);
-VAR_FN(64, 32, 16, 11);
-VAR_FN(32, 64, 16, 11);
-VAR_FN(32, 32, 16, 10);
-VAR_FN(32, 16, 16, 9);
-VAR_FN(16, 32, 16, 9);
-VAR_FN(16, 16, 16, 8);
-VAR_FN(16, 8, 8, 7);
-VAR_FN(8, 16, 8, 7);
-VAR_FN(8, 8, 8, 6);
+VAR_FN(64, 64, 16, 12)
+VAR_FN(64, 32, 16, 11)
+VAR_FN(32, 64, 16, 11)
+VAR_FN(32, 32, 16, 10)
+VAR_FN(32, 16, 16, 9)
+VAR_FN(16, 32, 16, 9)
+VAR_FN(16, 16, 16, 8)
+VAR_FN(16, 8, 8, 7)
+VAR_FN(8, 16, 8, 7)
+VAR_FN(8, 8, 8, 6)
#undef VAR_FN
@@ -255,10 +255,10 @@ unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
const uint16_t *ref, ptrdiff_t ref_stride, int height, \
unsigned int *sse, void *unused0, void *unused);
#define DECLS(opt) \
- DECL(8, opt); \
+ DECL(8, opt) \
DECL(16, opt)
-DECLS(sse2);
+DECLS(sse2)
#undef DECLS
#undef DECL
@@ -383,20 +383,20 @@ DECLS(sse2);
return (var >= 0) ? (uint32_t)var : 0; \
}
-#define FNS(opt) \
- FN(64, 64, 16, 6, 6, opt, (int64_t)); \
- FN(64, 32, 16, 6, 5, opt, (int64_t)); \
- FN(32, 64, 16, 5, 6, opt, (int64_t)); \
- FN(32, 32, 16, 5, 5, opt, (int64_t)); \
- FN(32, 16, 16, 5, 4, opt, (int64_t)); \
- FN(16, 32, 16, 4, 5, opt, (int64_t)); \
- FN(16, 16, 16, 4, 4, opt, (int64_t)); \
- FN(16, 8, 16, 4, 3, opt, (int64_t)); \
- FN(8, 16, 8, 3, 4, opt, (int64_t)); \
- FN(8, 8, 8, 3, 3, opt, (int64_t)); \
- FN(8, 4, 8, 3, 2, opt, (int64_t));
-
-FNS(sse2);
+#define FNS(opt) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt, (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt, (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt, (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt, (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt, (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt, (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt, (int64_t)) \
+ FN(8, 16, 8, 3, 4, opt, (int64_t)) \
+ FN(8, 8, 8, 3, 3, opt, (int64_t)) \
+ FN(8, 4, 8, 3, 2, opt, (int64_t))
+
+FNS(sse2)
#undef FNS
#undef FN
@@ -412,7 +412,7 @@ FNS(sse2);
DECL(16, opt1) \
DECL(8, opt1)
-DECLS(sse2);
+DECLS(sse2)
#undef DECL
#undef DECLS
@@ -542,20 +542,20 @@ DECLS(sse2);
return (var >= 0) ? (uint32_t)var : 0; \
}
-#define FNS(opt1) \
- FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
- FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
- FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
- FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
- FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
- FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
- FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
- FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
- FN(8, 16, 8, 4, 3, opt1, (int64_t)); \
- FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
- FN(8, 4, 8, 3, 2, opt1, (int64_t));
-
-FNS(sse2);
+#define FNS(opt1) \
+ FN(64, 64, 16, 6, 6, opt1, (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt1, (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt1, (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt1, (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt1, (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt1, (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt1, (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt1, (int64_t)) \
+ FN(8, 16, 8, 4, 3, opt1, (int64_t)) \
+ FN(8, 8, 8, 3, 3, opt1, (int64_t)) \
+ FN(8, 4, 8, 3, 2, opt1, (int64_t))
+
+FNS(sse2)
#undef FNS
#undef FN
diff --git a/libvpx/vpx_dsp/x86/loopfilter_sse2.c b/libvpx/vpx_dsp/x86/loopfilter_sse2.c
index b6ff24834..347c9fdbe 100644
--- a/libvpx/vpx_dsp/x86/loopfilter_sse2.c
+++ b/libvpx/vpx_dsp/x86/loopfilter_sse2.c
@@ -211,21 +211,21 @@ void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
// 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
- storeu_uint32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
+ storeu_int32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
ps1ps0 = _mm_srli_si128(ps1ps0, 4);
- storeu_uint32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
+ storeu_int32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
ps1ps0 = _mm_srli_si128(ps1ps0, 4);
- storeu_uint32(s + 2 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
+ storeu_int32(s + 2 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
ps1ps0 = _mm_srli_si128(ps1ps0, 4);
- storeu_uint32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
+ storeu_int32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
- storeu_uint32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
+ storeu_int32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
qs1qs0 = _mm_srli_si128(qs1qs0, 4);
- storeu_uint32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
+ storeu_int32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
qs1qs0 = _mm_srli_si128(qs1qs0, 4);
- storeu_uint32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
+ storeu_int32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
qs1qs0 = _mm_srli_si128(qs1qs0, 4);
- storeu_uint32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
+ storeu_int32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
}
void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch,
diff --git a/libvpx/vpx_dsp/x86/mem_sse2.h b/libvpx/vpx_dsp/x86/mem_sse2.h
index 258ab38e6..8b6d4d1dd 100644
--- a/libvpx/vpx_dsp/x86/mem_sse2.h
+++ b/libvpx/vpx_dsp/x86/mem_sse2.h
@@ -16,12 +16,12 @@
#include "./vpx_config.h"
-static INLINE void storeu_uint32(void *dst, uint32_t v) {
+static INLINE void storeu_int32(void *dst, int32_t v) {
memcpy(dst, &v, sizeof(v));
}
-static INLINE uint32_t loadu_uint32(const void *src) {
- uint32_t v;
+static INLINE int32_t loadu_int32(const void *src) {
+ int32_t v;
memcpy(&v, src, sizeof(v));
return v;
}
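These two helpers are the substance of the rename seen throughout the patch: the load and store go through memcpy, which compilers fold to a single unaligned 32-bit move but which, unlike dereferencing a cast pointer, is well-defined for misaligned addresses and under strict aliasing. Switching from uint32_t to int32_t matches the int-typed intrinsics at the call sites above (_mm_cvtsi128_si32 returns int; _mm_set_epi32 takes int), avoiding implicit sign-conversion. A standalone sketch of the idiom, with loadu_int32_sketch as a local stand-in for the header's loadu_int32:

#include <stdint.h>
#include <string.h>

static int32_t loadu_int32_sketch(const void *src) {
  int32_t v;
  memcpy(&v, src, sizeof(v)); /* folds to one unaligned 32-bit load */
  return v;
}

int main(void) {
  uint8_t buf[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  /* misaligned source is fine; yields 2 on little-endian targets */
  return (int)(loadu_int32_sketch(buf + 1) & 0xff);
}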
diff --git a/libvpx/vpx_dsp/x86/quantize_avx.c b/libvpx/vpx_dsp/x86/quantize_avx.c
index 0a91d36ea..706e4e641 100644
--- a/libvpx/vpx_dsp/x86/quantize_avx.c
+++ b/libvpx/vpx_dsp/x86/quantize_avx.c
@@ -21,8 +21,8 @@
#include "vpx_dsp/x86/quantize_ssse3.h"
void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan,
@@ -39,8 +39,6 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
__m128i eob = zero, eob0;
(void)scan;
- (void)skip_block;
- assert(!skip_block);
*eob_ptr = 0;
@@ -145,8 +143,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
}
void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
- const int16_t *round_ptr,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
@@ -166,8 +163,6 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
(void)scan;
(void)n_coeffs;
- (void)skip_block;
- assert(!skip_block);
// Setup global values.
// The 32x32 halves zbin and round.
diff --git a/libvpx/vpx_dsp/x86/quantize_sse2.c b/libvpx/vpx_dsp/x86/quantize_sse2.c
index e38a4059a..459d95f28 100644
--- a/libvpx/vpx_dsp/x86/quantize_sse2.c
+++ b/libvpx/vpx_dsp/x86/quantize_sse2.c
@@ -18,8 +18,8 @@
#include "vpx_dsp/x86/quantize_sse2.h"
void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan,
@@ -34,8 +34,6 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
__m128i eob, eob0;
(void)scan;
- (void)skip_block;
- assert(!skip_block);
// Setup global values.
load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
diff --git a/libvpx/vpx_dsp/x86/quantize_ssse3.c b/libvpx/vpx_dsp/x86/quantize_ssse3.c
index fc1d91959..9d2a88b7b 100644
--- a/libvpx/vpx_dsp/x86/quantize_ssse3.c
+++ b/libvpx/vpx_dsp/x86/quantize_ssse3.c
@@ -18,8 +18,8 @@
#include "vpx_dsp/x86/quantize_ssse3.h"
void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
@@ -34,8 +34,6 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
__m128i eob, eob0;
(void)scan;
- (void)skip_block;
- assert(!skip_block);
load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
dequant_ptr, &dequant, quant_shift_ptr, &shift);
@@ -111,7 +109,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
}
void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
+ const int16_t *zbin_ptr,
const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
@@ -131,8 +129,6 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
(void)scan;
(void)n_coeffs;
- (void)skip_block;
- assert(!skip_block);
// Setup global values.
// The 32x32 halves zbin and round.
diff --git a/libvpx/vpx_dsp/x86/sad4d_avx2.c b/libvpx/vpx_dsp/x86/sad4d_avx2.c
index a5c4f8c53..399b67b3f 100644
--- a/libvpx/vpx_dsp/x86/sad4d_avx2.c
+++ b/libvpx/vpx_dsp/x86/sad4d_avx2.c
@@ -11,8 +11,12 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
+// Note with sums[4] some versions of Visual Studio may fail due to parameter
+// alignment, though the functions should be equivalent:
+// error C2719: 'sums': formal parameter with requested alignment of 32 won't be
+// aligned
static INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
- uint32_t *sad_array) {
+ uint32_t sad_array[4]) {
const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
const __m256i t2 = _mm256_hadd_epi32(t0, t1);
@@ -69,63 +73,6 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
calc_final_4(sums, sad_array);
}
-void vpx_sad32x32x8_avx2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- uint32_t *sad_array) {
- int i;
- __m256i sums[8];
-
- sums[0] = _mm256_setzero_si256();
- sums[1] = _mm256_setzero_si256();
- sums[2] = _mm256_setzero_si256();
- sums[3] = _mm256_setzero_si256();
- sums[4] = _mm256_setzero_si256();
- sums[5] = _mm256_setzero_si256();
- sums[6] = _mm256_setzero_si256();
- sums[7] = _mm256_setzero_si256();
-
- for (i = 0; i < 32; i++) {
- __m256i r[8];
-
- // load src and all ref[]
- const __m256i s = _mm256_load_si256((const __m256i *)src_ptr);
- r[0] = _mm256_loadu_si256((const __m256i *)&ref_ptr[0]);
- r[1] = _mm256_loadu_si256((const __m256i *)&ref_ptr[1]);
- r[2] = _mm256_loadu_si256((const __m256i *)&ref_ptr[2]);
- r[3] = _mm256_loadu_si256((const __m256i *)&ref_ptr[3]);
- r[4] = _mm256_loadu_si256((const __m256i *)&ref_ptr[4]);
- r[5] = _mm256_loadu_si256((const __m256i *)&ref_ptr[5]);
- r[6] = _mm256_loadu_si256((const __m256i *)&ref_ptr[6]);
- r[7] = _mm256_loadu_si256((const __m256i *)&ref_ptr[7]);
-
- // sum of the absolute differences between every ref[] to src
- r[0] = _mm256_sad_epu8(r[0], s);
- r[1] = _mm256_sad_epu8(r[1], s);
- r[2] = _mm256_sad_epu8(r[2], s);
- r[3] = _mm256_sad_epu8(r[3], s);
- r[4] = _mm256_sad_epu8(r[4], s);
- r[5] = _mm256_sad_epu8(r[5], s);
- r[6] = _mm256_sad_epu8(r[6], s);
- r[7] = _mm256_sad_epu8(r[7], s);
-
- // sum every ref[]
- sums[0] = _mm256_add_epi32(sums[0], r[0]);
- sums[1] = _mm256_add_epi32(sums[1], r[1]);
- sums[2] = _mm256_add_epi32(sums[2], r[2]);
- sums[3] = _mm256_add_epi32(sums[3], r[3]);
- sums[4] = _mm256_add_epi32(sums[4], r[4]);
- sums[5] = _mm256_add_epi32(sums[5], r[5]);
- sums[6] = _mm256_add_epi32(sums[6], r[6]);
- sums[7] = _mm256_add_epi32(sums[7], r[7]);
-
- src_ptr += src_stride;
- ref_ptr += ref_stride;
- }
-
- calc_final_4(sums, sad_array);
- calc_final_4(sums + 4, sad_array + 4);
-}
-
void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
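The new comment in calc_final_4 explains the asymmetry above: per the note, spelling the input parameter as __m256i sums[4] makes some Visual Studio versions fail with C2719 over the 32-byte alignment request, so it stays a plain pointer with a /*[4]*/ annotation, while the plain uint32_t output can safely gain real array notation. In C the two spellings are the same type anyway, since an array parameter adjusts to a pointer; the [4] only documents the expected element count. A two-declaration sketch (calc4 is hypothetical):

#include <stdint.h>

void calc4(uint32_t *sad_array);
void calc4(uint32_t sad_array[4]); /* identical type after array-to-pointer adjustment */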
diff --git a/libvpx/vpx_dsp/x86/sad4d_avx512.c b/libvpx/vpx_dsp/x86/sad4d_avx512.c
index 4c5d70464..cfd23fedd 100644
--- a/libvpx/vpx_dsp/x86/sad4d_avx512.c
+++ b/libvpx/vpx_dsp/x86/sad4d_avx512.c
@@ -13,7 +13,7 @@
void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
- uint32_t res[4]) {
+ uint32_t sad_array[4]) {
__m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
__m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
__m512i sum_mlow, sum_mhigh;
@@ -78,6 +78,6 @@ void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride,
sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum256),
_mm256_extractf128_si256(sum256, 1));
- _mm_storeu_si128((__m128i *)(res), sum128);
+ _mm_storeu_si128((__m128i *)(sad_array), sum128);
}
}
diff --git a/libvpx/vpx_dsp/x86/sad_avx2.c b/libvpx/vpx_dsp/x86/sad_avx2.c
index d94413430..3b48acd51 100644
--- a/libvpx/vpx_dsp/x86/sad_avx2.c
+++ b/libvpx/vpx_dsp/x86/sad_avx2.c
@@ -71,17 +71,17 @@
return res; \
}
-#define FSAD64 \
- FSAD64_H(64); \
- FSAD64_H(32);
+#define FSAD64 \
+ FSAD64_H(64) \
+ FSAD64_H(32)
-#define FSAD32 \
- FSAD32_H(64); \
- FSAD32_H(32); \
- FSAD32_H(16);
+#define FSAD32 \
+ FSAD32_H(64) \
+ FSAD32_H(32) \
+ FSAD32_H(16)
-FSAD64;
-FSAD32;
+FSAD64
+FSAD32
#undef FSAD64
#undef FSAD32
@@ -160,17 +160,17 @@ FSAD32;
return res; \
}
-#define FSADAVG64 \
- FSADAVG64_H(64); \
- FSADAVG64_H(32);
+#define FSADAVG64 \
+ FSADAVG64_H(64) \
+ FSADAVG64_H(32)
-#define FSADAVG32 \
- FSADAVG32_H(64); \
- FSADAVG32_H(32); \
- FSADAVG32_H(16);
+#define FSADAVG32 \
+ FSADAVG32_H(64) \
+ FSADAVG32_H(32) \
+ FSADAVG32_H(16)
-FSADAVG64;
-FSADAVG32;
+FSADAVG64
+FSADAVG32
#undef FSADAVG64
#undef FSADAVG32
diff --git a/libvpx/vpx_dsp/x86/sad_sse3.asm b/libvpx/vpx_dsp/x86/sad_sse3.asm
deleted file mode 100644
index acbd2e4fa..000000000
--- a/libvpx/vpx_dsp/x86/sad_sse3.asm
+++ /dev/null
@@ -1,376 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro STACK_FRAME_CREATE_X3 0
-%if ABI_IS_32BIT
- %define src_ptr rsi
- %define src_stride rax
- %define ref_ptr rdi
- %define ref_stride rdx
- %define end_ptr rcx
- %define ret_var rbx
- %define result_ptr arg(4)
- %define height dword ptr arg(4)
- push rbp
- mov rbp, rsp
- push rsi
- push rdi
- push rbx
-
- mov rsi, arg(0) ; src_ptr
- mov rdi, arg(2) ; ref_ptr
-
- movsxd rax, dword ptr arg(1) ; src_stride
- movsxd rdx, dword ptr arg(3) ; ref_stride
-%else
- %if LIBVPX_YASM_WIN64
- SAVE_XMM 7, u
- %define src_ptr rcx
- %define src_stride rdx
- %define ref_ptr r8
- %define ref_stride r9
- %define end_ptr r10
- %define ret_var r11
- %define result_ptr [rsp+xmm_stack_space+8+4*8]
- %define height dword ptr [rsp+xmm_stack_space+8+4*8]
- %else
- %define src_ptr rdi
- %define src_stride rsi
- %define ref_ptr rdx
- %define ref_stride rcx
- %define end_ptr r9
- %define ret_var r10
- %define result_ptr r8
- %define height r8
- %endif
-%endif
-
-%endmacro
-
-%macro STACK_FRAME_DESTROY_X3 0
- %define src_ptr
- %define src_stride
- %define ref_ptr
- %define ref_stride
- %define end_ptr
- %define ret_var
- %define result_ptr
- %define height
-
-%if ABI_IS_32BIT
- pop rbx
- pop rdi
- pop rsi
- pop rbp
-%else
- %if LIBVPX_YASM_WIN64
- RESTORE_XMM
- %endif
-%endif
- ret
-%endmacro
-
-%macro PROCESS_16X2X3 5
-%if %1==0
- movdqa xmm0, XMMWORD PTR [%2]
- lddqu xmm5, XMMWORD PTR [%3]
- lddqu xmm6, XMMWORD PTR [%3+1]
- lddqu xmm7, XMMWORD PTR [%3+2]
-
- psadbw xmm5, xmm0
- psadbw xmm6, xmm0
- psadbw xmm7, xmm0
-%else
- movdqa xmm0, XMMWORD PTR [%2]
- lddqu xmm1, XMMWORD PTR [%3]
- lddqu xmm2, XMMWORD PTR [%3+1]
- lddqu xmm3, XMMWORD PTR [%3+2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endif
- movdqa xmm0, XMMWORD PTR [%2+%4]
- lddqu xmm1, XMMWORD PTR [%3+%5]
- lddqu xmm2, XMMWORD PTR [%3+%5+1]
- lddqu xmm3, XMMWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
- lea %2, [%2+%4*2]
- lea %3, [%3+%5*2]
-%endif
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endmacro
-
-%macro PROCESS_8X2X3 5
-%if %1==0
- movq mm0, QWORD PTR [%2]
- movq mm5, QWORD PTR [%3]
- movq mm6, QWORD PTR [%3+1]
- movq mm7, QWORD PTR [%3+2]
-
- psadbw mm5, mm0
- psadbw mm6, mm0
- psadbw mm7, mm0
-%else
- movq mm0, QWORD PTR [%2]
- movq mm1, QWORD PTR [%3]
- movq mm2, QWORD PTR [%3+1]
- movq mm3, QWORD PTR [%3+2]
-
- psadbw mm1, mm0
- psadbw mm2, mm0
- psadbw mm3, mm0
-
- paddw mm5, mm1
- paddw mm6, mm2
- paddw mm7, mm3
-%endif
- movq mm0, QWORD PTR [%2+%4]
- movq mm1, QWORD PTR [%3+%5]
- movq mm2, QWORD PTR [%3+%5+1]
- movq mm3, QWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
- lea %2, [%2+%4*2]
- lea %3, [%3+%5*2]
-%endif
-
- psadbw mm1, mm0
- psadbw mm2, mm0
- psadbw mm3, mm0
-
- paddw mm5, mm1
- paddw mm6, mm2
- paddw mm7, mm3
-%endmacro
-
-SECTION .text
-
-;void int vpx_sad16x16x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad16x16x3_sse3)
-sym(vpx_sad16x16x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rcx], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rcx+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rcx+8], xmm0
-
- STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad16x8x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad16x8x3_sse3)
-sym(vpx_sad16x8x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rcx], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rcx+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rcx+8], xmm0
-
- STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad8x16x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad8x16x3_sse3)
-sym(vpx_sad8x16x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- punpckldq mm5, mm6
-
- movq [rcx], mm5
- movd [rcx+8], mm7
-
- STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad8x8x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad8x8x3_sse3)
-sym(vpx_sad8x8x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- punpckldq mm5, mm6
-
- movq [rcx], mm5
- movd [rcx+8], mm7
-
- STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad4x4x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad4x4x3_sse3)
-sym(vpx_sad4x4x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- movd mm0, DWORD PTR [src_ptr]
- movd mm1, DWORD PTR [ref_ptr]
-
- movd mm2, DWORD PTR [src_ptr+src_stride]
- movd mm3, DWORD PTR [ref_ptr+ref_stride]
-
- punpcklbw mm0, mm2
- punpcklbw mm1, mm3
-
- movd mm4, DWORD PTR [ref_ptr+1]
- movd mm5, DWORD PTR [ref_ptr+2]
-
- movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
- movd mm3, DWORD PTR [ref_ptr+ref_stride+2]
-
- psadbw mm1, mm0
-
- punpcklbw mm4, mm2
- punpcklbw mm5, mm3
-
- psadbw mm4, mm0
- psadbw mm5, mm0
-
- lea src_ptr, [src_ptr+src_stride*2]
- lea ref_ptr, [ref_ptr+ref_stride*2]
-
- movd mm0, DWORD PTR [src_ptr]
- movd mm2, DWORD PTR [ref_ptr]
-
- movd mm3, DWORD PTR [src_ptr+src_stride]
- movd mm6, DWORD PTR [ref_ptr+ref_stride]
-
- punpcklbw mm0, mm3
- punpcklbw mm2, mm6
-
- movd mm3, DWORD PTR [ref_ptr+1]
- movd mm7, DWORD PTR [ref_ptr+2]
-
- psadbw mm2, mm0
-
- paddw mm1, mm2
-
- movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
- movd mm6, DWORD PTR [ref_ptr+ref_stride+2]
-
- punpcklbw mm3, mm2
- punpcklbw mm7, mm6
-
- psadbw mm3, mm0
- psadbw mm7, mm0
-
- paddw mm3, mm4
- paddw mm7, mm5
-
- mov rcx, result_ptr
-
- punpckldq mm1, mm3
-
- movq [rcx], mm1
- movd [rcx+8], mm7
-
- STACK_FRAME_DESTROY_X3
diff --git a/libvpx/vpx_dsp/x86/sad_sse4.asm b/libvpx/vpx_dsp/x86/sad_sse4.asm
deleted file mode 100644
index 0818ed5f0..000000000
--- a/libvpx/vpx_dsp/x86/sad_sse4.asm
+++ /dev/null
@@ -1,361 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X8 1
-%if %1
- movdqa xmm0, XMMWORD PTR [rsi]
- movq xmm1, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- movq xmm2, MMWORD PTR [rdi+16]
- punpcklqdq xmm1, xmm3
- punpcklqdq xmm3, xmm2
-
- movdqa xmm2, xmm1
- mpsadbw xmm1, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
-
- psrldq xmm0, 8
-
- movdqa xmm4, xmm3
- mpsadbw xmm3, xmm0, 0x0
- mpsadbw xmm4, xmm0, 0x5
-
- paddw xmm1, xmm2
- paddw xmm1, xmm3
- paddw xmm1, xmm4
-%else
- movdqa xmm0, XMMWORD PTR [rsi]
- movq xmm5, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- movq xmm2, MMWORD PTR [rdi+16]
- punpcklqdq xmm5, xmm3
- punpcklqdq xmm3, xmm2
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
-
- psrldq xmm0, 8
-
- movdqa xmm4, xmm3
- mpsadbw xmm3, xmm0, 0x0
- mpsadbw xmm4, xmm0, 0x5
-
- paddw xmm5, xmm2
- paddw xmm5, xmm3
- paddw xmm5, xmm4
-
- paddw xmm1, xmm5
-%endif
- movdqa xmm0, XMMWORD PTR [rsi + rax]
- movq xmm5, MMWORD PTR [rdi+ rdx]
- movq xmm3, MMWORD PTR [rdi+ rdx+8]
- movq xmm2, MMWORD PTR [rdi+ rdx+16]
- punpcklqdq xmm5, xmm3
- punpcklqdq xmm3, xmm2
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
-
- psrldq xmm0, 8
- movdqa xmm4, xmm3
- mpsadbw xmm3, xmm0, 0x0
- mpsadbw xmm4, xmm0, 0x5
-
- paddw xmm5, xmm2
- paddw xmm5, xmm3
- paddw xmm5, xmm4
-
- paddw xmm1, xmm5
-%endmacro
-
-%macro PROCESS_8X2X8 1
-%if %1
- movq xmm0, MMWORD PTR [rsi]
- movq xmm1, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm1, xmm3
-
- movdqa xmm2, xmm1
- mpsadbw xmm1, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
- paddw xmm1, xmm2
-%else
- movq xmm0, MMWORD PTR [rsi]
- movq xmm5, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm5, xmm3
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
- paddw xmm5, xmm2
-
- paddw xmm1, xmm5
-%endif
- movq xmm0, MMWORD PTR [rsi + rax]
- movq xmm5, MMWORD PTR [rdi+ rdx]
- movq xmm3, MMWORD PTR [rdi+ rdx+8]
- punpcklqdq xmm5, xmm3
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
- paddw xmm5, xmm2
-
- paddw xmm1, xmm5
-%endmacro
-
-%macro PROCESS_4X2X8 1
-%if %1
- movd xmm0, [rsi]
- movq xmm1, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm1, xmm3
-
- mpsadbw xmm1, xmm0, 0x0
-%else
- movd xmm0, [rsi]
- movq xmm5, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm5, xmm3
-
- mpsadbw xmm5, xmm0, 0x0
-
- paddw xmm1, xmm5
-%endif
- movd xmm0, [rsi + rax]
- movq xmm5, MMWORD PTR [rdi+ rdx]
- movq xmm3, MMWORD PTR [rdi+ rdx+8]
- punpcklqdq xmm5, xmm3
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- mpsadbw xmm5, xmm0, 0x0
-
- paddw xmm1, xmm5
-%endmacro
-
-%macro WRITE_AS_INTS 0
- mov rdi, arg(4) ;Results
- pxor xmm0, xmm0
- movdqa xmm2, xmm1
- punpcklwd xmm1, xmm0
- punpckhwd xmm2, xmm0
-
- movdqa [rdi], xmm1
- movdqa [rdi + 16], xmm2
-%endmacro
-
-SECTION .text
-
-;void vpx_sad16x16x8_sse4_1(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array);
-globalsym(vpx_sad16x16x8_sse4_1)
-sym(vpx_sad16x16x8_sse4_1):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_16X2X8 1
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
-
- WRITE_AS_INTS
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vpx_sad16x8x8_sse4_1(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array
-;);
-globalsym(vpx_sad16x8x8_sse4_1)
-sym(vpx_sad16x8x8_sse4_1):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_16X2X8 1
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
-
- WRITE_AS_INTS
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vpx_sad8x8x8_sse4_1(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array
-;);
-globalsym(vpx_sad8x8x8_sse4_1)
-sym(vpx_sad8x8x8_sse4_1):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_8X2X8 1
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
-
- WRITE_AS_INTS
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vpx_sad8x16x8_sse4_1(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array
-;);
-globalsym(vpx_sad8x16x8_sse4_1)
-sym(vpx_sad8x16x8_sse4_1):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_8X2X8 1
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
-
- WRITE_AS_INTS
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vpx_sad4x4x8_sse4_1(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array
-;);
-globalsym(vpx_sad4x4x8_sse4_1)
-sym(vpx_sad4x4x8_sse4_1):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_4X2X8 1
- PROCESS_4X2X8 0
-
- WRITE_AS_INTS
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-
diff --git a/libvpx/vpx_dsp/x86/sad_ssse3.asm b/libvpx/vpx_dsp/x86/sad_ssse3.asm
deleted file mode 100644
index a5bc6d730..000000000
--- a/libvpx/vpx_dsp/x86/sad_ssse3.asm
+++ /dev/null
@@ -1,372 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X3 1
-%if %1
- movdqa xmm0, XMMWORD PTR [rsi]
- lddqu xmm5, XMMWORD PTR [rdi]
- lddqu xmm6, XMMWORD PTR [rdi+1]
- lddqu xmm7, XMMWORD PTR [rdi+2]
-
- psadbw xmm5, xmm0
- psadbw xmm6, xmm0
- psadbw xmm7, xmm0
-%else
- movdqa xmm0, XMMWORD PTR [rsi]
- lddqu xmm1, XMMWORD PTR [rdi]
- lddqu xmm2, XMMWORD PTR [rdi+1]
- lddqu xmm3, XMMWORD PTR [rdi+2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endif
- movdqa xmm0, XMMWORD PTR [rsi+rax]
- lddqu xmm1, XMMWORD PTR [rdi+rdx]
- lddqu xmm2, XMMWORD PTR [rdi+rdx+1]
- lddqu xmm3, XMMWORD PTR [rdi+rdx+2]
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endmacro
-
-%macro PROCESS_16X2X3_OFFSET 2
-%if %1
- movdqa xmm0, XMMWORD PTR [rsi]
- movdqa xmm4, XMMWORD PTR [rdi]
- movdqa xmm7, XMMWORD PTR [rdi+16]
-
- movdqa xmm5, xmm7
- palignr xmm5, xmm4, %2
-
- movdqa xmm6, xmm7
- palignr xmm6, xmm4, (%2+1)
-
- palignr xmm7, xmm4, (%2+2)
-
- psadbw xmm5, xmm0
- psadbw xmm6, xmm0
- psadbw xmm7, xmm0
-%else
- movdqa xmm0, XMMWORD PTR [rsi]
- movdqa xmm4, XMMWORD PTR [rdi]
- movdqa xmm3, XMMWORD PTR [rdi+16]
-
- movdqa xmm1, xmm3
- palignr xmm1, xmm4, %2
-
- movdqa xmm2, xmm3
- palignr xmm2, xmm4, (%2+1)
-
- palignr xmm3, xmm4, (%2+2)
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endif
- movdqa xmm0, XMMWORD PTR [rsi+rax]
- movdqa xmm4, XMMWORD PTR [rdi+rdx]
- movdqa xmm3, XMMWORD PTR [rdi+rdx+16]
-
- movdqa xmm1, xmm3
- palignr xmm1, xmm4, %2
-
- movdqa xmm2, xmm3
- palignr xmm2, xmm4, (%2+1)
-
- palignr xmm3, xmm4, (%2+2)
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endmacro
-
-%macro PROCESS_16X16X3_OFFSET 2
-%2_aligned_by_%1:
-
- sub rdi, %1
-
- PROCESS_16X2X3_OFFSET 1, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
-
- jmp %2_store_off
-
-%endmacro
-
-%macro PROCESS_16X8X3_OFFSET 2
-%2_aligned_by_%1:
-
- sub rdi, %1
-
- PROCESS_16X2X3_OFFSET 1, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
-
- jmp %2_store_off
-
-%endmacro
-
-SECTION .text
-
-;void int vpx_sad16x16x3_ssse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad16x16x3_ssse3)
-sym(vpx_sad16x16x3_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- push rsi
- push rdi
- push rcx
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- mov rdx, 0xf
- and rdx, rdi
-
- jmp .vpx_sad16x16x3_ssse3_skiptable
-.vpx_sad16x16x3_ssse3_jumptable:
- dd .vpx_sad16x16x3_ssse3_aligned_by_0 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_1 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_2 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_3 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_4 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_5 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_6 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_7 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_8 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_9 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_12 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump
-.vpx_sad16x16x3_ssse3_skiptable:
-
- call .vpx_sad16x16x3_ssse3_do_jump
-.vpx_sad16x16x3_ssse3_do_jump:
- pop rcx ; get the address of do_jump
- mov rax, .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_ssse3_do_jump
- add rax, rcx ; get the absolute address of vpx_sad16x16x3_ssse3_jumptable
-
- movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
- add rcx, rax
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- jmp rcx
-
- PROCESS_16X16X3_OFFSET 0, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 1, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 2, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 3, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 4, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 5, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 6, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 7, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 8, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 9, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3
-
-.vpx_sad16x16x3_ssse3_aligned_by_15:
- PROCESS_16X2X3 1
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
-
-.vpx_sad16x16x3_ssse3_store_off:
- mov rdi, arg(4) ;Results
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rdi], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rdi+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rdi+8], xmm0
-
- ; begin epilog
- pop rcx
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void int vpx_sad16x8x3_ssse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad16x8x3_ssse3)
-sym(vpx_sad16x8x3_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- push rsi
- push rdi
- push rcx
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- mov rdx, 0xf
- and rdx, rdi
-
- jmp .vpx_sad16x8x3_ssse3_skiptable
-.vpx_sad16x8x3_ssse3_jumptable:
- dd .vpx_sad16x8x3_ssse3_aligned_by_0 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_1 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_2 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_3 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_4 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_5 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_6 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_7 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_8 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_9 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump
-.vpx_sad16x8x3_ssse3_skiptable:
-
- call .vpx_sad16x8x3_ssse3_do_jump
-.vpx_sad16x8x3_ssse3_do_jump:
- pop rcx ; get the address of do_jump
- mov rax, .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ssse3_do_jump
- add rax, rcx ; get the absolute address of vpx_sad16x8x3_ssse3_jumptable
-
- movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
- add rcx, rax
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- jmp rcx
-
- PROCESS_16X8X3_OFFSET 0, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 1, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 2, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 3, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 4, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 5, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 6, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 7, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 8, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 9, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3
-
-.vpx_sad16x8x3_ssse3_aligned_by_15:
-
- PROCESS_16X2X3 1
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
-
-.vpx_sad16x8x3_ssse3_store_off:
- mov rdi, arg(4) ;Results
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rdi], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rdi+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rdi+8], xmm0
-
- ; begin epilog
- pop rcx
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/libvpx/vpx_dsp/x86/variance_sse2.c b/libvpx/vpx_dsp/x86/variance_sse2.c
index 37ef64eca..a67c92aad 100644
--- a/libvpx/vpx_dsp/x86/variance_sse2.c
+++ b/libvpx/vpx_dsp/x86/variance_sse2.c
@@ -36,8 +36,8 @@ unsigned int vpx_get_mb_ss_sse2(const int16_t *src_ptr) {
}
static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
- const __m128i p0 = _mm_cvtsi32_si128(loadu_uint32(p + 0 * stride));
- const __m128i p1 = _mm_cvtsi32_si128(loadu_uint32(p + 1 * stride));
+ const __m128i p0 = _mm_cvtsi32_si128(loadu_int32(p + 0 * stride));
+ const __m128i p1 = _mm_cvtsi32_si128(loadu_int32(p + 1 * stride));
const __m128i p01 = _mm_unpacklo_epi32(p0, p1);
return _mm_unpacklo_epi8(p01, _mm_setzero_si128());
}
@@ -471,23 +471,23 @@ DECLS(ssse3, ssse3);
(unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
}
-#define FNS(opt1, opt2) \
- FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \
- FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \
- FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \
- FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \
- FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \
- FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \
- FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
- FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t)); \
- FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)); \
- FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)); \
- FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)); \
- FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)); \
+#define FNS(opt1, opt2) \
+ FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t)) \
+ FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)) \
+ FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)) \
+ FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)) \
+ FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)) \
FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t))
-FNS(sse2, sse2);
-FNS(ssse3, ssse3);
+FNS(sse2, sse2)
+FNS(ssse3, ssse3)
#undef FNS
#undef FN
@@ -543,23 +543,23 @@ DECLS(ssse3, ssse3);
(unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
}
-#define FNS(opt1, opt2) \
- FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \
- FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \
- FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \
- FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \
- FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \
- FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \
- FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
- FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t)); \
- FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)); \
- FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)); \
- FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)); \
- FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)); \
+#define FNS(opt1, opt2) \
+ FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t)) \
+ FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)) \
+ FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)) \
+ FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)) \
+ FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)) \
FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t))
-FNS(sse2, sse);
-FNS(ssse3, ssse3);
+FNS(sse2, sse)
+FNS(ssse3, ssse3)
#undef FNS
#undef FN
diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
index 239179028..0cbd151dc 100644
--- a/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
@@ -1040,12 +1040,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2;
// const InterpKernel *filter, int x0_q4,
// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h);
-FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0)
FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - (num_taps / 2 - 1) * src_stride, ,
- sse2, 0);
-FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1);
+ sse2, 0)
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1)
FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
- src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1);
+ src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1)
// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
@@ -1057,8 +1057,8 @@ FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
// const InterpKernel *filter, int x0_q4,
// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h);
-FUN_CONV_2D(, sse2, 0);
-FUN_CONV_2D(avg_, sse2, 1);
+FUN_CONV_2D(, sse2, 0)
+FUN_CONV_2D(avg_, sse2, 1)
#if CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64
// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
@@ -1139,12 +1139,12 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
// const int16_t *filter_y,
// int y_step_q4,
// int w, int h, int bd);
-HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0);
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0)
HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
- src - src_stride * (num_taps / 2 - 1), , sse2, 0);
-HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1);
+ src - src_stride * (num_taps / 2 - 1), , sse2, 0)
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1)
HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
- src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1);
+ src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1)
// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
@@ -1156,6 +1156,6 @@ HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
// const InterpKernel *filter, int x0_q4,
// int32_t x_step_q4, int y0_q4,
// int y_step_q4, int w, int h, int bd);
-HIGH_FUN_CONV_2D(, sse2, 0);
-HIGH_FUN_CONV_2D(avg_, sse2, 1);
+HIGH_FUN_CONV_2D(, sse2, 0)
+HIGH_FUN_CONV_2D(avg_, sse2, 1)
#endif // CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64
diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index 1eaa19bfc..6f2983a4b 100644
--- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -969,12 +969,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
// const InterpKernel *filter, int x0_q4,
// int32_t x_step_q4, int y0_q4,
// int y_step_q4, int w, int h);
-FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0)
FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), ,
- avx2, 0);
-FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1);
+ avx2, 0)
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1)
FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
- src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1);
+ src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1)
// void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
@@ -986,6 +986,6 @@ FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
// const InterpKernel *filter, int x0_q4,
// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h);
-FUN_CONV_2D(, avx2, 0);
-FUN_CONV_2D(avg_, avx2, 1);
+FUN_CONV_2D(, avx2, 0)
+FUN_CONV_2D(avg_, avx2, 1)
#endif // HAVE_AX2 && HAVE_SSSE3
diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
index 77355a208..ed46d6245 100644
--- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -731,12 +731,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
// const InterpKernel *filter, int x0_q4,
// int32_t x_step_q4, int y0_q4,
// int y_step_q4, int w, int h);
-FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3, 0);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3, 0)
FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), ,
- ssse3, 0);
-FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3, 1);
+ ssse3, 0)
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3, 1)
FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
- src - src_stride * (num_taps / 2 - 1), avg_, ssse3, 1);
+ src - src_stride * (num_taps / 2 - 1), avg_, ssse3, 1)
static void filter_horiz_w8_ssse3(const uint8_t *const src,
const ptrdiff_t src_stride,
@@ -1083,5 +1083,5 @@ void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
// const InterpKernel *filter, int x0_q4,
// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h);
-FUN_CONV_2D(, ssse3, 0);
-FUN_CONV_2D(avg_, ssse3, 1);
+FUN_CONV_2D(, ssse3, 0)
+FUN_CONV_2D(avg_, ssse3, 1)