Diffstat (limited to 'libvpx/vpx_dsp/x86')
-rw-r--r--  libvpx/vpx_dsp/x86/avg_intrin_sse2.c                 2
-rw-r--r--  libvpx/vpx_dsp/x86/avg_pred_sse2.c                   6
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_convolve_avx2.c           12
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c    10
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_variance_sse2.c           86
-rw-r--r--  libvpx/vpx_dsp/x86/loopfilter_sse2.c                16
-rw-r--r--  libvpx/vpx_dsp/x86/mem_sse2.h                        6
-rw-r--r--  libvpx/vpx_dsp/x86/quantize_avx.c                   11
-rw-r--r--  libvpx/vpx_dsp/x86/quantize_sse2.c                   6
-rw-r--r--  libvpx/vpx_dsp/x86/quantize_ssse3.c                 10
-rw-r--r--  libvpx/vpx_dsp/x86/sad4d_avx2.c                     63
-rw-r--r--  libvpx/vpx_dsp/x86/sad4d_avx512.c                    4
-rw-r--r--  libvpx/vpx_dsp/x86/sad_avx2.c                       36
-rw-r--r--  libvpx/vpx_dsp/x86/sad_sse3.asm                    376
-rw-r--r--  libvpx/vpx_dsp/x86/sad_sse4.asm                    361
-rw-r--r--  libvpx/vpx_dsp/x86/sad_ssse3.asm                   372
-rw-r--r--  libvpx/vpx_dsp/x86/variance_sse2.c                  64
-rw-r--r--  libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c    24
-rw-r--r--  libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c    12
-rw-r--r--  libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c   12
20 files changed, 156 insertions, 1333 deletions
diff --git a/libvpx/vpx_dsp/x86/avg_intrin_sse2.c b/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
index 3cba258f6..9da2f34c9 100644
--- a/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
+++ b/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
@@ -464,7 +464,7 @@ int vpx_satd_sse2(const tran_low_t *coeff, int length) {
return _mm_cvtsi128_si32(accum);
}
-void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref,
+void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref,
const int ref_stride, const int height) {
int idx;
__m128i zero = _mm_setzero_si128();
diff --git a/libvpx/vpx_dsp/x86/avg_pred_sse2.c b/libvpx/vpx_dsp/x86/avg_pred_sse2.c
index e4e1e0e7a..c6e70f744 100644
--- a/libvpx/vpx_dsp/x86/avg_pred_sse2.c
+++ b/libvpx/vpx_dsp/x86/avg_pred_sse2.c
@@ -46,9 +46,9 @@ void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width,
r = _mm_loadu_si128((const __m128i *)ref);
ref += 16;
} else if (width == 4) {
- r = _mm_set_epi32(loadu_uint32(ref + 3 * ref_stride),
- loadu_uint32(ref + 2 * ref_stride),
- loadu_uint32(ref + ref_stride), loadu_uint32(ref));
+ r = _mm_set_epi32(loadu_int32(ref + 3 * ref_stride),
+ loadu_int32(ref + 2 * ref_stride),
+ loadu_int32(ref + ref_stride), loadu_int32(ref));
ref += 4 * ref_stride;
} else {
diff --git a/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c b/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c
index 320962561..01a52ec8b 100644
--- a/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c
+++ b/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c
@@ -1465,10 +1465,10 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
#define vpx_highbd_filter_block1d4_h4_avg_avx2 \
vpx_highbd_filter_block1d4_h8_avg_avx2
-HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0);
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0)
HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
- src - src_stride * (num_taps / 2 - 1), , avx2, 0);
-HIGH_FUN_CONV_2D(, avx2, 0);
+ src - src_stride * (num_taps / 2 - 1), , avx2, 0)
+HIGH_FUN_CONV_2D(, avx2, 0)
// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
@@ -1487,9 +1487,9 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
#define vpx_highbd_filter_block1d4_v2_avg_avx2 \
vpx_highbd_filter_block1d4_v2_avg_sse2
-HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1);
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1)
HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
- src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1);
-HIGH_FUN_CONV_2D(avg_, avx2, 1);
+ src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1)
+HIGH_FUN_CONV_2D(avg_, avx2, 1)
#undef HIGHBD_FUNC
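The trailing-semicolon removals in this file (and in the variance, SAD, and subpixel files below) all follow one pattern: the HIGH_FUN_CONV_*/VAR_FN-style macros expand to complete function definitions that already end in '}', so a ';' at the invocation site leaves a stray empty declaration at file scope, presumably flagged by -pedantic and clang's -Wextra-semi. A minimal standalone sketch; DEFINE_SUM is a hypothetical macro, not one from libvpx:

/* Hypothetical macro: expands to a full function definition, so the
 * invocation must not add a ';' of its own. */
#define DEFINE_SUM(name, type) \
  type name(type a, type b) { return a + b; }

DEFINE_SUM(sum_int, int) /* OK: the expansion already ends in '}' */
/* DEFINE_SUM(sum_long, long); would leave a stray ';' at file scope,
 * which -pedantic / -Wextra-semi builds warn about. */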
diff --git a/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
index 7149e4fb7..4535a0f7a 100644
--- a/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -18,7 +18,7 @@
#if CONFIG_VP9_HIGHBITDEPTH
void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
- int skip_block, const int16_t *zbin_ptr,
+ const int16_t *zbin_ptr,
const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
@@ -39,8 +39,6 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
(void)scan;
- (void)skip_block;
- assert(!skip_block);
memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
@@ -94,8 +92,8 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
}
void vpx_highbd_quantize_b_32x32_sse2(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
- const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
@@ -107,8 +105,6 @@ void vpx_highbd_quantize_b_32x32_sse2(
const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
(void)scan;
- (void)skip_block;
- assert(!skip_block);
zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
zbins[1] = _mm_set1_epi32(zbin1_tmp);
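The skip_block removals here and in the quantize_* files below share one shape: the parameter was dead inside the function, immediately cast to void and asserted false, so it is dropped from the signature and callers are expected not to invoke the quantizer at all for skipped blocks. A simplified, hypothetical before/after sketch (plain int arguments stand in for the real tran_low_t/int16_t parameter lists):

#include <assert.h>

/* before: dead flag, guarded by an assert */
static void quantize_old(const int *coeff, int n, int skip_block) {
  (void)skip_block;
  assert(!skip_block);
  (void)coeff;
  (void)n;
}

/* after: the flag is gone; callers simply skip the call instead */
static void quantize_new(const int *coeff, int n) {
  (void)coeff;
  (void)n;
}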
diff --git a/libvpx/vpx_dsp/x86/highbd_variance_sse2.c b/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
index dd6cfbb2c..7c8d79b09 100644
--- a/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
@@ -121,8 +121,8 @@ static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
*sse = ROUND_POWER_OF_TWO(*sse, 8); \
}
-HIGH_GET_VAR(16);
-HIGH_GET_VAR(8);
+HIGH_GET_VAR(16)
+HIGH_GET_VAR(8)
#undef HIGH_GET_VAR
@@ -167,16 +167,16 @@ HIGH_GET_VAR(8);
return (var >= 0) ? (uint32_t)var : 0; \
}
-VAR_FN(64, 64, 16, 12);
-VAR_FN(64, 32, 16, 11);
-VAR_FN(32, 64, 16, 11);
-VAR_FN(32, 32, 16, 10);
-VAR_FN(32, 16, 16, 9);
-VAR_FN(16, 32, 16, 9);
-VAR_FN(16, 16, 16, 8);
-VAR_FN(16, 8, 8, 7);
-VAR_FN(8, 16, 8, 7);
-VAR_FN(8, 8, 8, 6);
+VAR_FN(64, 64, 16, 12)
+VAR_FN(64, 32, 16, 11)
+VAR_FN(32, 64, 16, 11)
+VAR_FN(32, 32, 16, 10)
+VAR_FN(32, 16, 16, 9)
+VAR_FN(16, 32, 16, 9)
+VAR_FN(16, 16, 16, 8)
+VAR_FN(16, 8, 8, 7)
+VAR_FN(8, 16, 8, 7)
+VAR_FN(8, 8, 8, 6)
#undef VAR_FN
@@ -255,10 +255,10 @@ unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
const uint16_t *ref, ptrdiff_t ref_stride, int height, \
unsigned int *sse, void *unused0, void *unused);
#define DECLS(opt) \
- DECL(8, opt); \
+ DECL(8, opt) \
DECL(16, opt)
-DECLS(sse2);
+DECLS(sse2)
#undef DECLS
#undef DECL
@@ -383,20 +383,20 @@ DECLS(sse2);
return (var >= 0) ? (uint32_t)var : 0; \
}
-#define FNS(opt) \
- FN(64, 64, 16, 6, 6, opt, (int64_t)); \
- FN(64, 32, 16, 6, 5, opt, (int64_t)); \
- FN(32, 64, 16, 5, 6, opt, (int64_t)); \
- FN(32, 32, 16, 5, 5, opt, (int64_t)); \
- FN(32, 16, 16, 5, 4, opt, (int64_t)); \
- FN(16, 32, 16, 4, 5, opt, (int64_t)); \
- FN(16, 16, 16, 4, 4, opt, (int64_t)); \
- FN(16, 8, 16, 4, 3, opt, (int64_t)); \
- FN(8, 16, 8, 3, 4, opt, (int64_t)); \
- FN(8, 8, 8, 3, 3, opt, (int64_t)); \
- FN(8, 4, 8, 3, 2, opt, (int64_t));
-
-FNS(sse2);
+#define FNS(opt) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt, (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt, (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt, (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt, (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt, (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt, (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt, (int64_t)) \
+ FN(8, 16, 8, 3, 4, opt, (int64_t)) \
+ FN(8, 8, 8, 3, 3, opt, (int64_t)) \
+ FN(8, 4, 8, 3, 2, opt, (int64_t))
+
+FNS(sse2)
#undef FNS
#undef FN
@@ -412,7 +412,7 @@ FNS(sse2);
DECL(16, opt1) \
DECL(8, opt1)
-DECLS(sse2);
+DECLS(sse2)
#undef DECL
#undef DECLS
@@ -542,20 +542,20 @@ DECLS(sse2);
return (var >= 0) ? (uint32_t)var : 0; \
}
-#define FNS(opt1) \
- FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
- FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
- FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
- FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
- FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
- FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
- FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
- FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
- FN(8, 16, 8, 4, 3, opt1, (int64_t)); \
- FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
- FN(8, 4, 8, 3, 2, opt1, (int64_t));
-
-FNS(sse2);
+#define FNS(opt1) \
+ FN(64, 64, 16, 6, 6, opt1, (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt1, (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt1, (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt1, (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt1, (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt1, (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt1, (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt1, (int64_t)) \
+ FN(8, 16, 8, 4, 3, opt1, (int64_t)) \
+ FN(8, 8, 8, 3, 3, opt1, (int64_t)) \
+ FN(8, 4, 8, 3, 2, opt1, (int64_t))
+
+FNS(sse2)
#undef FNS
#undef FN
diff --git a/libvpx/vpx_dsp/x86/loopfilter_sse2.c b/libvpx/vpx_dsp/x86/loopfilter_sse2.c
index b6ff24834..347c9fdbe 100644
--- a/libvpx/vpx_dsp/x86/loopfilter_sse2.c
+++ b/libvpx/vpx_dsp/x86/loopfilter_sse2.c
@@ -211,21 +211,21 @@ void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
// 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
- storeu_uint32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
+ storeu_int32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
ps1ps0 = _mm_srli_si128(ps1ps0, 4);
- storeu_uint32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
+ storeu_int32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
ps1ps0 = _mm_srli_si128(ps1ps0, 4);
- storeu_uint32(s + 2 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
+ storeu_int32(s + 2 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
ps1ps0 = _mm_srli_si128(ps1ps0, 4);
- storeu_uint32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
+ storeu_int32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
- storeu_uint32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
+ storeu_int32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
qs1qs0 = _mm_srli_si128(qs1qs0, 4);
- storeu_uint32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
+ storeu_int32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
qs1qs0 = _mm_srli_si128(qs1qs0, 4);
- storeu_uint32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
+ storeu_int32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
qs1qs0 = _mm_srli_si128(qs1qs0, 4);
- storeu_uint32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
+ storeu_int32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
}
void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch,
diff --git a/libvpx/vpx_dsp/x86/mem_sse2.h b/libvpx/vpx_dsp/x86/mem_sse2.h
index 258ab38e6..8b6d4d1dd 100644
--- a/libvpx/vpx_dsp/x86/mem_sse2.h
+++ b/libvpx/vpx_dsp/x86/mem_sse2.h
@@ -16,12 +16,12 @@
#include "./vpx_config.h"
-static INLINE void storeu_uint32(void *dst, uint32_t v) {
+static INLINE void storeu_int32(void *dst, int32_t v) {
memcpy(dst, &v, sizeof(v));
}
-static INLINE uint32_t loadu_uint32(const void *src) {
- uint32_t v;
+static INLINE int32_t loadu_int32(const void *src) {
+ int32_t v;
memcpy(&v, src, sizeof(v));
return v;
}
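These two helpers are the substance of the rename seen throughout the patch: the load and store go through memcpy, which compilers fold to a single unaligned 32-bit move but which, unlike dereferencing a cast pointer, is well-defined for misaligned addresses and under strict aliasing. Switching from uint32_t to int32_t matches the int-typed intrinsics at the call sites above (_mm_cvtsi128_si32 returns int; _mm_set_epi32 takes int), avoiding implicit sign-conversion. A standalone sketch of the idiom, with loadu_int32_sketch as a local stand-in for the header's loadu_int32:

#include <stdint.h>
#include <string.h>

static int32_t loadu_int32_sketch(const void *src) {
  int32_t v;
  memcpy(&v, src, sizeof(v)); /* folds to one unaligned 32-bit load */
  return v;
}

int main(void) {
  uint8_t buf[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  /* misaligned source is fine; yields 2 on little-endian targets */
  return (int)(loadu_int32_sketch(buf + 1) & 0xff);
}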
diff --git a/libvpx/vpx_dsp/x86/quantize_avx.c b/libvpx/vpx_dsp/x86/quantize_avx.c
index 0a91d36ea..706e4e641 100644
--- a/libvpx/vpx_dsp/x86/quantize_avx.c
+++ b/libvpx/vpx_dsp/x86/quantize_avx.c
@@ -21,8 +21,8 @@
#include "vpx_dsp/x86/quantize_ssse3.h"
void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan,
@@ -39,8 +39,6 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
__m128i eob = zero, eob0;
(void)scan;
- (void)skip_block;
- assert(!skip_block);
*eob_ptr = 0;
@@ -145,8 +143,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
}
void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
- const int16_t *round_ptr,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
@@ -166,8 +163,6 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
(void)scan;
(void)n_coeffs;
- (void)skip_block;
- assert(!skip_block);
// Setup global values.
// The 32x32 halves zbin and round.
diff --git a/libvpx/vpx_dsp/x86/quantize_sse2.c b/libvpx/vpx_dsp/x86/quantize_sse2.c
index e38a4059a..459d95f28 100644
--- a/libvpx/vpx_dsp/x86/quantize_sse2.c
+++ b/libvpx/vpx_dsp/x86/quantize_sse2.c
@@ -18,8 +18,8 @@
#include "vpx_dsp/x86/quantize_sse2.h"
void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan,
@@ -34,8 +34,6 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
__m128i eob, eob0;
(void)scan;
- (void)skip_block;
- assert(!skip_block);
// Setup global values.
load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
diff --git a/libvpx/vpx_dsp/x86/quantize_ssse3.c b/libvpx/vpx_dsp/x86/quantize_ssse3.c
index fc1d91959..9d2a88b7b 100644
--- a/libvpx/vpx_dsp/x86/quantize_ssse3.c
+++ b/libvpx/vpx_dsp/x86/quantize_ssse3.c
@@ -18,8 +18,8 @@
#include "vpx_dsp/x86/quantize_ssse3.h"
void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
@@ -34,8 +34,6 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
__m128i eob, eob0;
(void)scan;
- (void)skip_block;
- assert(!skip_block);
load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
dequant_ptr, &dequant, quant_shift_ptr, &shift);
@@ -111,7 +109,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
}
void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
+ const int16_t *zbin_ptr,
const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
@@ -131,8 +129,6 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
(void)scan;
(void)n_coeffs;
- (void)skip_block;
- assert(!skip_block);
// Setup global values.
// The 32x32 halves zbin and round.
diff --git a/libvpx/vpx_dsp/x86/sad4d_avx2.c b/libvpx/vpx_dsp/x86/sad4d_avx2.c
index a5c4f8c53..399b67b3f 100644
--- a/libvpx/vpx_dsp/x86/sad4d_avx2.c
+++ b/libvpx/vpx_dsp/x86/sad4d_avx2.c
@@ -11,8 +11,12 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
+// Note with sums[4] some versions of Visual Studio may fail due to parameter
+// alignment, though the functions should be equivalent:
+// error C2719: 'sums': formal parameter with requested alignment of 32 won't be
+// aligned
static INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
- uint32_t *sad_array) {
+ uint32_t sad_array[4]) {
const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
const __m256i t2 = _mm256_hadd_epi32(t0, t1);
@@ -69,63 +73,6 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
calc_final_4(sums, sad_array);
}
-void vpx_sad32x32x8_avx2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- uint32_t *sad_array) {
- int i;
- __m256i sums[8];
-
- sums[0] = _mm256_setzero_si256();
- sums[1] = _mm256_setzero_si256();
- sums[2] = _mm256_setzero_si256();
- sums[3] = _mm256_setzero_si256();
- sums[4] = _mm256_setzero_si256();
- sums[5] = _mm256_setzero_si256();
- sums[6] = _mm256_setzero_si256();
- sums[7] = _mm256_setzero_si256();
-
- for (i = 0; i < 32; i++) {
- __m256i r[8];
-
- // load src and all ref[]
- const __m256i s = _mm256_load_si256((const __m256i *)src_ptr);
- r[0] = _mm256_loadu_si256((const __m256i *)&ref_ptr[0]);
- r[1] = _mm256_loadu_si256((const __m256i *)&ref_ptr[1]);
- r[2] = _mm256_loadu_si256((const __m256i *)&ref_ptr[2]);
- r[3] = _mm256_loadu_si256((const __m256i *)&ref_ptr[3]);
- r[4] = _mm256_loadu_si256((const __m256i *)&ref_ptr[4]);
- r[5] = _mm256_loadu_si256((const __m256i *)&ref_ptr[5]);
- r[6] = _mm256_loadu_si256((const __m256i *)&ref_ptr[6]);
- r[7] = _mm256_loadu_si256((const __m256i *)&ref_ptr[7]);
-
- // sum of the absolute differences between every ref[] to src
- r[0] = _mm256_sad_epu8(r[0], s);
- r[1] = _mm256_sad_epu8(r[1], s);
- r[2] = _mm256_sad_epu8(r[2], s);
- r[3] = _mm256_sad_epu8(r[3], s);
- r[4] = _mm256_sad_epu8(r[4], s);
- r[5] = _mm256_sad_epu8(r[5], s);
- r[6] = _mm256_sad_epu8(r[6], s);
- r[7] = _mm256_sad_epu8(r[7], s);
-
- // sum every ref[]
- sums[0] = _mm256_add_epi32(sums[0], r[0]);
- sums[1] = _mm256_add_epi32(sums[1], r[1]);
- sums[2] = _mm256_add_epi32(sums[2], r[2]);
- sums[3] = _mm256_add_epi32(sums[3], r[3]);
- sums[4] = _mm256_add_epi32(sums[4], r[4]);
- sums[5] = _mm256_add_epi32(sums[5], r[5]);
- sums[6] = _mm256_add_epi32(sums[6], r[6]);
- sums[7] = _mm256_add_epi32(sums[7], r[7]);
-
- src_ptr += src_stride;
- ref_ptr += ref_stride;
- }
-
- calc_final_4(sums, sad_array);
- calc_final_4(sums + 4, sad_array + 4);
-}
-
void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
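The new comment in calc_final_4 explains the asymmetry above: per the note, spelling the input parameter as __m256i sums[4] makes some Visual Studio versions fail with C2719 over the 32-byte alignment request, so it stays a plain pointer with a /*[4]*/ annotation, while the plain uint32_t output can safely gain real array notation. In C the two spellings are the same type anyway, since an array parameter adjusts to a pointer; the [4] only documents the expected element count. A two-declaration sketch (calc4 is hypothetical):

#include <stdint.h>

void calc4(uint32_t *sad_array);
void calc4(uint32_t sad_array[4]); /* identical type after array-to-pointer adjustment */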
diff --git a/libvpx/vpx_dsp/x86/sad4d_avx512.c b/libvpx/vpx_dsp/x86/sad4d_avx512.c
index 4c5d70464..cfd23fedd 100644
--- a/libvpx/vpx_dsp/x86/sad4d_avx512.c
+++ b/libvpx/vpx_dsp/x86/sad4d_avx512.c
@@ -13,7 +13,7 @@
void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
- uint32_t res[4]) {
+ uint32_t sad_array[4]) {
__m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
__m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
__m512i sum_mlow, sum_mhigh;
@@ -78,6 +78,6 @@ void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride,
sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum256),
_mm256_extractf128_si256(sum256, 1));
- _mm_storeu_si128((__m128i *)(res), sum128);
+ _mm_storeu_si128((__m128i *)(sad_array), sum128);
}
}
diff --git a/libvpx/vpx_dsp/x86/sad_avx2.c b/libvpx/vpx_dsp/x86/sad_avx2.c
index d94413430..3b48acd51 100644
--- a/libvpx/vpx_dsp/x86/sad_avx2.c
+++ b/libvpx/vpx_dsp/x86/sad_avx2.c
@@ -71,17 +71,17 @@
return res; \
}
-#define FSAD64 \
- FSAD64_H(64); \
- FSAD64_H(32);
+#define FSAD64 \
+ FSAD64_H(64) \
+ FSAD64_H(32)
-#define FSAD32 \
- FSAD32_H(64); \
- FSAD32_H(32); \
- FSAD32_H(16);
+#define FSAD32 \
+ FSAD32_H(64) \
+ FSAD32_H(32) \
+ FSAD32_H(16)
-FSAD64;
-FSAD32;
+FSAD64
+FSAD32
#undef FSAD64
#undef FSAD32
@@ -160,17 +160,17 @@ FSAD32;
return res; \
}
-#define FSADAVG64 \
- FSADAVG64_H(64); \
- FSADAVG64_H(32);
+#define FSADAVG64 \
+ FSADAVG64_H(64) \
+ FSADAVG64_H(32)
-#define FSADAVG32 \
- FSADAVG32_H(64); \
- FSADAVG32_H(32); \
- FSADAVG32_H(16);
+#define FSADAVG32 \
+ FSADAVG32_H(64) \
+ FSADAVG32_H(32) \
+ FSADAVG32_H(16)
-FSADAVG64;
-FSADAVG32;
+FSADAVG64
+FSADAVG32
#undef FSADAVG64
#undef FSADAVG32
diff --git a/libvpx/vpx_dsp/x86/sad_sse3.asm b/libvpx/vpx_dsp/x86/sad_sse3.asm
deleted file mode 100644
index acbd2e4fa..000000000
--- a/libvpx/vpx_dsp/x86/sad_sse3.asm
+++ /dev/null
@@ -1,376 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro STACK_FRAME_CREATE_X3 0
-%if ABI_IS_32BIT
- %define src_ptr rsi
- %define src_stride rax
- %define ref_ptr rdi
- %define ref_stride rdx
- %define end_ptr rcx
- %define ret_var rbx
- %define result_ptr arg(4)
- %define height dword ptr arg(4)
- push rbp
- mov rbp, rsp
- push rsi
- push rdi
- push rbx
-
- mov rsi, arg(0) ; src_ptr
- mov rdi, arg(2) ; ref_ptr
-
- movsxd rax, dword ptr arg(1) ; src_stride
- movsxd rdx, dword ptr arg(3) ; ref_stride
-%else
- %if LIBVPX_YASM_WIN64
- SAVE_XMM 7, u
- %define src_ptr rcx
- %define src_stride rdx
- %define ref_ptr r8
- %define ref_stride r9
- %define end_ptr r10
- %define ret_var r11
- %define result_ptr [rsp+xmm_stack_space+8+4*8]
- %define height dword ptr [rsp+xmm_stack_space+8+4*8]
- %else
- %define src_ptr rdi
- %define src_stride rsi
- %define ref_ptr rdx
- %define ref_stride rcx
- %define end_ptr r9
- %define ret_var r10
- %define result_ptr r8
- %define height r8
- %endif
-%endif
-
-%endmacro
-
-%macro STACK_FRAME_DESTROY_X3 0
- %define src_ptr
- %define src_stride
- %define ref_ptr
- %define ref_stride
- %define end_ptr
- %define ret_var
- %define result_ptr
- %define height
-
-%if ABI_IS_32BIT
- pop rbx
- pop rdi
- pop rsi
- pop rbp
-%else
- %if LIBVPX_YASM_WIN64
- RESTORE_XMM
- %endif
-%endif
- ret
-%endmacro
-
-%macro PROCESS_16X2X3 5
-%if %1==0
- movdqa xmm0, XMMWORD PTR [%2]
- lddqu xmm5, XMMWORD PTR [%3]
- lddqu xmm6, XMMWORD PTR [%3+1]
- lddqu xmm7, XMMWORD PTR [%3+2]
-
- psadbw xmm5, xmm0
- psadbw xmm6, xmm0
- psadbw xmm7, xmm0
-%else
- movdqa xmm0, XMMWORD PTR [%2]
- lddqu xmm1, XMMWORD PTR [%3]
- lddqu xmm2, XMMWORD PTR [%3+1]
- lddqu xmm3, XMMWORD PTR [%3+2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endif
- movdqa xmm0, XMMWORD PTR [%2+%4]
- lddqu xmm1, XMMWORD PTR [%3+%5]
- lddqu xmm2, XMMWORD PTR [%3+%5+1]
- lddqu xmm3, XMMWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
- lea %2, [%2+%4*2]
- lea %3, [%3+%5*2]
-%endif
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endmacro
-
-%macro PROCESS_8X2X3 5
-%if %1==0
- movq mm0, QWORD PTR [%2]
- movq mm5, QWORD PTR [%3]
- movq mm6, QWORD PTR [%3+1]
- movq mm7, QWORD PTR [%3+2]
-
- psadbw mm5, mm0
- psadbw mm6, mm0
- psadbw mm7, mm0
-%else
- movq mm0, QWORD PTR [%2]
- movq mm1, QWORD PTR [%3]
- movq mm2, QWORD PTR [%3+1]
- movq mm3, QWORD PTR [%3+2]
-
- psadbw mm1, mm0
- psadbw mm2, mm0
- psadbw mm3, mm0
-
- paddw mm5, mm1
- paddw mm6, mm2
- paddw mm7, mm3
-%endif
- movq mm0, QWORD PTR [%2+%4]
- movq mm1, QWORD PTR [%3+%5]
- movq mm2, QWORD PTR [%3+%5+1]
- movq mm3, QWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
- lea %2, [%2+%4*2]
- lea %3, [%3+%5*2]
-%endif
-
- psadbw mm1, mm0
- psadbw mm2, mm0
- psadbw mm3, mm0
-
- paddw mm5, mm1
- paddw mm6, mm2
- paddw mm7, mm3
-%endmacro
-
-SECTION .text
-
-;void int vpx_sad16x16x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad16x16x3_sse3)
-sym(vpx_sad16x16x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rcx], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rcx+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rcx+8], xmm0
-
- STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad16x8x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad16x8x3_sse3)
-sym(vpx_sad16x8x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rcx], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rcx+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rcx+8], xmm0
-
- STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad8x16x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad8x16x3_sse3)
-sym(vpx_sad8x16x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- punpckldq mm5, mm6
-
- movq [rcx], mm5
- movd [rcx+8], mm7
-
- STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad8x8x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad8x8x3_sse3)
-sym(vpx_sad8x8x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- punpckldq mm5, mm6
-
- movq [rcx], mm5
- movd [rcx+8], mm7
-
- STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad4x4x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad4x4x3_sse3)
-sym(vpx_sad4x4x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- movd mm0, DWORD PTR [src_ptr]
- movd mm1, DWORD PTR [ref_ptr]
-
- movd mm2, DWORD PTR [src_ptr+src_stride]
- movd mm3, DWORD PTR [ref_ptr+ref_stride]
-
- punpcklbw mm0, mm2
- punpcklbw mm1, mm3
-
- movd mm4, DWORD PTR [ref_ptr+1]
- movd mm5, DWORD PTR [ref_ptr+2]
-
- movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
- movd mm3, DWORD PTR [ref_ptr+ref_stride+2]
-
- psadbw mm1, mm0
-
- punpcklbw mm4, mm2
- punpcklbw mm5, mm3
-
- psadbw mm4, mm0
- psadbw mm5, mm0
-
- lea src_ptr, [src_ptr+src_stride*2]
- lea ref_ptr, [ref_ptr+ref_stride*2]
-
- movd mm0, DWORD PTR [src_ptr]
- movd mm2, DWORD PTR [ref_ptr]
-
- movd mm3, DWORD PTR [src_ptr+src_stride]
- movd mm6, DWORD PTR [ref_ptr+ref_stride]
-
- punpcklbw mm0, mm3
- punpcklbw mm2, mm6
-
- movd mm3, DWORD PTR [ref_ptr+1]
- movd mm7, DWORD PTR [ref_ptr+2]
-
- psadbw mm2, mm0
-
- paddw mm1, mm2
-
- movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
- movd mm6, DWORD PTR [ref_ptr+ref_stride+2]
-
- punpcklbw mm3, mm2
- punpcklbw mm7, mm6
-
- psadbw mm3, mm0
- psadbw mm7, mm0
-
- paddw mm3, mm4
- paddw mm7, mm5
-
- mov rcx, result_ptr
-
- punpckldq mm1, mm3
-
- movq [rcx], mm1
- movd [rcx+8], mm7
-
- STACK_FRAME_DESTROY_X3
diff --git a/libvpx/vpx_dsp/x86/sad_sse4.asm b/libvpx/vpx_dsp/x86/sad_sse4.asm
deleted file mode 100644
index 0818ed5f0..000000000
--- a/libvpx/vpx_dsp/x86/sad_sse4.asm
+++ /dev/null
@@ -1,361 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X8 1
-%if %1
- movdqa xmm0, XMMWORD PTR [rsi]
- movq xmm1, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- movq xmm2, MMWORD PTR [rdi+16]
- punpcklqdq xmm1, xmm3
- punpcklqdq xmm3, xmm2
-
- movdqa xmm2, xmm1
- mpsadbw xmm1, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
-
- psrldq xmm0, 8
-
- movdqa xmm4, xmm3
- mpsadbw xmm3, xmm0, 0x0
- mpsadbw xmm4, xmm0, 0x5
-
- paddw xmm1, xmm2
- paddw xmm1, xmm3
- paddw xmm1, xmm4
-%else
- movdqa xmm0, XMMWORD PTR [rsi]
- movq xmm5, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- movq xmm2, MMWORD PTR [rdi+16]
- punpcklqdq xmm5, xmm3
- punpcklqdq xmm3, xmm2
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
-
- psrldq xmm0, 8
-
- movdqa xmm4, xmm3
- mpsadbw xmm3, xmm0, 0x0
- mpsadbw xmm4, xmm0, 0x5
-
- paddw xmm5, xmm2
- paddw xmm5, xmm3
- paddw xmm5, xmm4
-
- paddw xmm1, xmm5
-%endif
- movdqa xmm0, XMMWORD PTR [rsi + rax]
- movq xmm5, MMWORD PTR [rdi+ rdx]
- movq xmm3, MMWORD PTR [rdi+ rdx+8]
- movq xmm2, MMWORD PTR [rdi+ rdx+16]
- punpcklqdq xmm5, xmm3
- punpcklqdq xmm3, xmm2
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
-
- psrldq xmm0, 8
- movdqa xmm4, xmm3
- mpsadbw xmm3, xmm0, 0x0
- mpsadbw xmm4, xmm0, 0x5
-
- paddw xmm5, xmm2
- paddw xmm5, xmm3
- paddw xmm5, xmm4
-
- paddw xmm1, xmm5
-%endmacro
-
-%macro PROCESS_8X2X8 1
-%if %1
- movq xmm0, MMWORD PTR [rsi]
- movq xmm1, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm1, xmm3
-
- movdqa xmm2, xmm1
- mpsadbw xmm1, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
- paddw xmm1, xmm2
-%else
- movq xmm0, MMWORD PTR [rsi]
- movq xmm5, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm5, xmm3
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
- paddw xmm5, xmm2
-
- paddw xmm1, xmm5
-%endif
- movq xmm0, MMWORD PTR [rsi + rax]
- movq xmm5, MMWORD PTR [rdi+ rdx]
- movq xmm3, MMWORD PTR [rdi+ rdx+8]
- punpcklqdq xmm5, xmm3
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
- paddw xmm5, xmm2
-
- paddw xmm1, xmm5
-%endmacro
-
-%macro PROCESS_4X2X8 1
-%if %1
- movd xmm0, [rsi]
- movq xmm1, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm1, xmm3
-
- mpsadbw xmm1, xmm0, 0x0
-%else
- movd xmm0, [rsi]
- movq xmm5, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm5, xmm3
-
- mpsadbw xmm5, xmm0, 0x0
-
- paddw xmm1, xmm5
-%endif
- movd xmm0, [rsi + rax]
- movq xmm5, MMWORD PTR [rdi+ rdx]
- movq xmm3, MMWORD PTR [rdi+ rdx+8]
- punpcklqdq xmm5, xmm3
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- mpsadbw xmm5, xmm0, 0x0
-
- paddw xmm1, xmm5
-%endmacro
-
-%macro WRITE_AS_INTS 0
- mov rdi, arg(4) ;Results
- pxor xmm0, xmm0
- movdqa xmm2, xmm1
- punpcklwd xmm1, xmm0
- punpckhwd xmm2, xmm0
-
- movdqa [rdi], xmm1
- movdqa [rdi + 16], xmm2
-%endmacro
-
-SECTION .text
-
-;void vpx_sad16x16x8_sse4_1(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array);
-globalsym(vpx_sad16x16x8_sse4_1)
-sym(vpx_sad16x16x8_sse4_1):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_16X2X8 1
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
-
- WRITE_AS_INTS
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vpx_sad16x8x8_sse4_1(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array
-;);
-globalsym(vpx_sad16x8x8_sse4_1)
-sym(vpx_sad16x8x8_sse4_1):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_16X2X8 1
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
-
- WRITE_AS_INTS
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vpx_sad8x8x8_sse4_1(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array
-;);
-globalsym(vpx_sad8x8x8_sse4_1)
-sym(vpx_sad8x8x8_sse4_1):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_8X2X8 1
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
-
- WRITE_AS_INTS
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vpx_sad8x16x8_sse4_1(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array
-;);
-globalsym(vpx_sad8x16x8_sse4_1)
-sym(vpx_sad8x16x8_sse4_1):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_8X2X8 1
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
-
- WRITE_AS_INTS
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vpx_sad4x4x8_sse4_1(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array
-;);
-globalsym(vpx_sad4x4x8_sse4_1)
-sym(vpx_sad4x4x8_sse4_1):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_4X2X8 1
- PROCESS_4X2X8 0
-
- WRITE_AS_INTS
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-
diff --git a/libvpx/vpx_dsp/x86/sad_ssse3.asm b/libvpx/vpx_dsp/x86/sad_ssse3.asm
deleted file mode 100644
index a5bc6d730..000000000
--- a/libvpx/vpx_dsp/x86/sad_ssse3.asm
+++ /dev/null
@@ -1,372 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X3 1
-%if %1
- movdqa xmm0, XMMWORD PTR [rsi]
- lddqu xmm5, XMMWORD PTR [rdi]
- lddqu xmm6, XMMWORD PTR [rdi+1]
- lddqu xmm7, XMMWORD PTR [rdi+2]
-
- psadbw xmm5, xmm0
- psadbw xmm6, xmm0
- psadbw xmm7, xmm0
-%else
- movdqa xmm0, XMMWORD PTR [rsi]
- lddqu xmm1, XMMWORD PTR [rdi]
- lddqu xmm2, XMMWORD PTR [rdi+1]
- lddqu xmm3, XMMWORD PTR [rdi+2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endif
- movdqa xmm0, XMMWORD PTR [rsi+rax]
- lddqu xmm1, XMMWORD PTR [rdi+rdx]
- lddqu xmm2, XMMWORD PTR [rdi+rdx+1]
- lddqu xmm3, XMMWORD PTR [rdi+rdx+2]
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endmacro
-
-%macro PROCESS_16X2X3_OFFSET 2
-%if %1
- movdqa xmm0, XMMWORD PTR [rsi]
- movdqa xmm4, XMMWORD PTR [rdi]
- movdqa xmm7, XMMWORD PTR [rdi+16]
-
- movdqa xmm5, xmm7
- palignr xmm5, xmm4, %2
-
- movdqa xmm6, xmm7
- palignr xmm6, xmm4, (%2+1)
-
- palignr xmm7, xmm4, (%2+2)
-
- psadbw xmm5, xmm0
- psadbw xmm6, xmm0
- psadbw xmm7, xmm0
-%else
- movdqa xmm0, XMMWORD PTR [rsi]
- movdqa xmm4, XMMWORD PTR [rdi]
- movdqa xmm3, XMMWORD PTR [rdi+16]
-
- movdqa xmm1, xmm3
- palignr xmm1, xmm4, %2
-
- movdqa xmm2, xmm3
- palignr xmm2, xmm4, (%2+1)
-
- palignr xmm3, xmm4, (%2+2)
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endif
- movdqa xmm0, XMMWORD PTR [rsi+rax]
- movdqa xmm4, XMMWORD PTR [rdi+rdx]
- movdqa xmm3, XMMWORD PTR [rdi+rdx+16]
-
- movdqa xmm1, xmm3
- palignr xmm1, xmm4, %2
-
- movdqa xmm2, xmm3
- palignr xmm2, xmm4, (%2+1)
-
- palignr xmm3, xmm4, (%2+2)
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endmacro
-
-%macro PROCESS_16X16X3_OFFSET 2
-%2_aligned_by_%1:
-
- sub rdi, %1
-
- PROCESS_16X2X3_OFFSET 1, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
-
- jmp %2_store_off
-
-%endmacro
-
-%macro PROCESS_16X8X3_OFFSET 2
-%2_aligned_by_%1:
-
- sub rdi, %1
-
- PROCESS_16X2X3_OFFSET 1, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
-
- jmp %2_store_off
-
-%endmacro
-
-SECTION .text
-
-;void int vpx_sad16x16x3_ssse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad16x16x3_ssse3)
-sym(vpx_sad16x16x3_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- push rsi
- push rdi
- push rcx
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- mov rdx, 0xf
- and rdx, rdi
-
- jmp .vpx_sad16x16x3_ssse3_skiptable
-.vpx_sad16x16x3_ssse3_jumptable:
- dd .vpx_sad16x16x3_ssse3_aligned_by_0 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_1 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_2 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_3 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_4 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_5 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_6 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_7 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_8 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_9 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_12 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump
-.vpx_sad16x16x3_ssse3_skiptable:
-
- call .vpx_sad16x16x3_ssse3_do_jump
-.vpx_sad16x16x3_ssse3_do_jump:
- pop rcx ; get the address of do_jump
- mov rax, .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_ssse3_do_jump
- add rax, rcx ; get the absolute address of vpx_sad16x16x3_ssse3_jumptable
-
- movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
- add rcx, rax
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- jmp rcx
-
- PROCESS_16X16X3_OFFSET 0, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 1, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 2, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 3, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 4, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 5, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 6, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 7, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 8, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 9, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3
-
-.vpx_sad16x16x3_ssse3_aligned_by_15:
- PROCESS_16X2X3 1
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
-
-.vpx_sad16x16x3_ssse3_store_off:
- mov rdi, arg(4) ;Results
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rdi], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rdi+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rdi+8], xmm0
-
- ; begin epilog
- pop rcx
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void int vpx_sad16x8x3_ssse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad16x8x3_ssse3)
-sym(vpx_sad16x8x3_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- push rsi
- push rdi
- push rcx
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- mov rdx, 0xf
- and rdx, rdi
-
- jmp .vpx_sad16x8x3_ssse3_skiptable
-.vpx_sad16x8x3_ssse3_jumptable:
- dd .vpx_sad16x8x3_ssse3_aligned_by_0 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_1 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_2 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_3 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_4 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_5 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_6 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_7 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_8 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_9 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump
-.vpx_sad16x8x3_ssse3_skiptable:
-
- call .vpx_sad16x8x3_ssse3_do_jump
-.vpx_sad16x8x3_ssse3_do_jump:
- pop rcx ; get the address of do_jump
- mov rax, .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ssse3_do_jump
- add rax, rcx ; get the absolute address of vpx_sad16x8x3_ssse3_jumptable
-
- movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
- add rcx, rax
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- jmp rcx
-
- PROCESS_16X8X3_OFFSET 0, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 1, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 2, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 3, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 4, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 5, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 6, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 7, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 8, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 9, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3
-
-.vpx_sad16x8x3_ssse3_aligned_by_15:
-
- PROCESS_16X2X3 1
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
-
-.vpx_sad16x8x3_ssse3_store_off:
- mov rdi, arg(4) ;Results
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rdi], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rdi+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rdi+8], xmm0
-
- ; begin epilog
- pop rcx
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/libvpx/vpx_dsp/x86/variance_sse2.c b/libvpx/vpx_dsp/x86/variance_sse2.c
index 37ef64eca..a67c92aad 100644
--- a/libvpx/vpx_dsp/x86/variance_sse2.c
+++ b/libvpx/vpx_dsp/x86/variance_sse2.c
@@ -36,8 +36,8 @@ unsigned int vpx_get_mb_ss_sse2(const int16_t *src_ptr) {
}
static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
- const __m128i p0 = _mm_cvtsi32_si128(loadu_uint32(p + 0 * stride));
- const __m128i p1 = _mm_cvtsi32_si128(loadu_uint32(p + 1 * stride));
+ const __m128i p0 = _mm_cvtsi32_si128(loadu_int32(p + 0 * stride));
+ const __m128i p1 = _mm_cvtsi32_si128(loadu_int32(p + 1 * stride));
const __m128i p01 = _mm_unpacklo_epi32(p0, p1);
return _mm_unpacklo_epi8(p01, _mm_setzero_si128());
}
@@ -471,23 +471,23 @@ DECLS(ssse3, ssse3);
(unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
}
-#define FNS(opt1, opt2) \
- FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \
- FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \
- FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \
- FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \
- FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \
- FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \
- FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
- FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t)); \
- FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)); \
- FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)); \
- FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)); \
- FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)); \
+#define FNS(opt1, opt2) \
+ FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t)) \
+ FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)) \
+ FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)) \
+ FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)) \
+ FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)) \
FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t))
-FNS(sse2, sse2);
-FNS(ssse3, ssse3);
+FNS(sse2, sse2)
+FNS(ssse3, ssse3)
#undef FNS
#undef FN
@@ -543,23 +543,23 @@ DECLS(ssse3, ssse3);
(unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
}
-#define FNS(opt1, opt2) \
- FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \
- FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \
- FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \
- FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \
- FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \
- FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \
- FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
- FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t)); \
- FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)); \
- FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)); \
- FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)); \
- FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)); \
+#define FNS(opt1, opt2) \
+ FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t)) \
+ FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)) \
+ FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)) \
+ FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)) \
+ FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)) \
FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t))
-FNS(sse2, sse);
-FNS(ssse3, ssse3);
+FNS(sse2, sse)
+FNS(ssse3, ssse3)
#undef FNS
#undef FN
diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
index 239179028..0cbd151dc 100644
--- a/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
@@ -1040,12 +1040,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2;
// const InterpKernel *filter, int x0_q4,
// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h);
-FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0)
FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - (num_taps / 2 - 1) * src_stride, ,
- sse2, 0);
-FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1);
+ sse2, 0)
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1)
FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
- src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1);
+ src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1)
// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
@@ -1057,8 +1057,8 @@ FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
// const InterpKernel *filter, int x0_q4,
// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h);
-FUN_CONV_2D(, sse2, 0);
-FUN_CONV_2D(avg_, sse2, 1);
+FUN_CONV_2D(, sse2, 0)
+FUN_CONV_2D(avg_, sse2, 1)
#if CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64
// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
@@ -1139,12 +1139,12 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
// const int16_t *filter_y,
// int y_step_q4,
// int w, int h, int bd);
-HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0);
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0)
HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
- src - src_stride * (num_taps / 2 - 1), , sse2, 0);
-HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1);
+ src - src_stride * (num_taps / 2 - 1), , sse2, 0)
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1)
HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
- src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1);
+ src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1)
// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
@@ -1156,6 +1156,6 @@ HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
// const InterpKernel *filter, int x0_q4,
// int32_t x_step_q4, int y0_q4,
// int y_step_q4, int w, int h, int bd);
-HIGH_FUN_CONV_2D(, sse2, 0);
-HIGH_FUN_CONV_2D(avg_, sse2, 1);
+HIGH_FUN_CONV_2D(, sse2, 0)
+HIGH_FUN_CONV_2D(avg_, sse2, 1)
#endif // CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64
diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index 1eaa19bfc..6f2983a4b 100644
--- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -969,12 +969,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
// const InterpKernel *filter, int x0_q4,
// int32_t x_step_q4, int y0_q4,
// int y_step_q4, int w, int h);
-FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0)
FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), ,
- avx2, 0);
-FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1);
+ avx2, 0)
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1)
FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
- src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1);
+ src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1)
// void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
@@ -986,6 +986,6 @@ FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
// const InterpKernel *filter, int x0_q4,
// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h);
-FUN_CONV_2D(, avx2, 0);
-FUN_CONV_2D(avg_, avx2, 1);
+FUN_CONV_2D(, avx2, 0)
+FUN_CONV_2D(avg_, avx2, 1)
#endif // HAVE_AX2 && HAVE_SSSE3
diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
index 77355a208..ed46d6245 100644
--- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -731,12 +731,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
// const InterpKernel *filter, int x0_q4,
// int32_t x_step_q4, int y0_q4,
// int y_step_q4, int w, int h);
-FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3, 0);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3, 0)
FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), ,
- ssse3, 0);
-FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3, 1);
+ ssse3, 0)
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3, 1)
FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
- src - src_stride * (num_taps / 2 - 1), avg_, ssse3, 1);
+ src - src_stride * (num_taps / 2 - 1), avg_, ssse3, 1)
static void filter_horiz_w8_ssse3(const uint8_t *const src,
const ptrdiff_t src_stride,
@@ -1083,5 +1083,5 @@ void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
// const InterpKernel *filter, int x0_q4,
// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h);
-FUN_CONV_2D(, ssse3, 0);
-FUN_CONV_2D(avg_, ssse3, 1);
+FUN_CONV_2D(, ssse3, 0)
+FUN_CONV_2D(avg_, ssse3, 1)